diff --git a/CMakeLists.txt b/CMakeLists.txt
index c105f802a..2755c5f2b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,17 +130,34 @@ endif()
 
 list(SORT build_kernel)
 message(STATUS "build_kernel:[${build_kernel}]")
-
-foreach(kernel ${build_kernel})
-  if (NOT IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel}")
+
+# Kernel sources live either in kernels/<kernel>/ or one level deeper in
+# kernels/<group>/<kernel>/. Collect the top-level entries once so each
+# requested kernel can be looked up under every candidate group directory.
+file(GLOB all_kernels "${CMAKE_CURRENT_LIST_DIR}/kernels/*")
+foreach(kernel ${build_kernel})
+  # Empty means "not resolved"; CMake has no single-quoted strings, so use "".
+  set(kernel_dir "")
+  if(IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel}")
+    set(kernel_dir "${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel}")
+  else()
+    foreach(group_dir ${all_kernels})
+      # Also false when group_dir is a plain file, so no extra check is needed.
+      if(IS_DIRECTORY "${group_dir}/${kernel}")
+        set(kernel_dir "${group_dir}/${kernel}")
+        break()  # first matching group wins
+      endif()
+    endforeach()
+  endif()
+  if("${kernel_dir}" STREQUAL "")
     message(WARNING "kernel/${kernel} is not a directory, ${kernel} is an alias")
     continue()
   endif()
-  file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/*.mlu")
-  file(GLOB_RECURSE src_helper_files ${src_helper_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/utils/cnnl_helper.cpp")
-  file(GLOB_RECURSE arch_binary_files ${arch_binary_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/${MLUOP_TARGET_CPU_ARCH}/*.o")
+  file(GLOB_RECURSE src_files ${src_files} "${kernel_dir}/*.cpp" "${kernel_dir}/*.mlu")
 endforeach()
-
+
+# The helper path does not depend on the kernel, so glob it once outside the loop.
+file(GLOB_RECURSE src_helper_files ${src_helper_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/utils/cnnl_helper.cpp")
 file(GLOB_RECURSE core_src_files ${core_src_files}
"${CMAKE_CURRENT_SOURCE_DIR}/core/*.cpp") # set(src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/test/main.cpp") diff --git a/kernel_depends.toml b/kernel_depends.toml index 7dc5a0441..a325f0c76 100755 --- a/kernel_depends.toml +++ b/kernel_depends.toml @@ -41,5 +41,6 @@ deform_roi_pool_forward = ["deform_roi_pool"] deform_roi_pool_backward = ["deform_roi_pool"] carafe_forward = ["carafe"] carafe_backward = ["carafe"] -dcn_backward_weight = ["dcn_forward"] -dcn_backward_data = ["dcn_forward"] +dcn_backward_weight = ["dcn_common"] +dcn_backward_data = ["dcn_common"] +dcn_forward = ["dcn_common"] diff --git a/kernels/border_align_backward/border_align_backward.cpp b/kernels/border_align_backward/border_align_backward.cpp deleted file mode 100644 index 1dadcedd0..000000000 --- a/kernels/border_align_backward/border_align_backward.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "border_align_backward.h" - -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "core/tool.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -mluOpStatus_t mluOpBorderAlignBackward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_output_desc, - const void *grad_output, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const mluOpTensorDescriptor_t argmax_idx_desc, - const void *argmax_idx, const int32_t pool_size, - const mluOpTensorDescriptor_t grad_input_desc, void *grad_input) { - const std::string API = "[mluOpBorderAlignBackward]"; - // params check - PARAM_CHECK(API, handle != nullptr); - PARAM_CHECK(API, grad_output_desc != nullptr); - PARAM_CHECK(API, boxes_desc != nullptr); - PARAM_CHECK(API, argmax_idx_desc != nullptr); - PARAM_CHECK(API, grad_input_desc != nullptr); - - PARAM_CHECK(API, grad_output_desc->dim == 4); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, argmax_idx_desc->dim == 4); - PARAM_CHECK(API, grad_input_desc->dim == 4); - - const int32_t border_num = 4; - const int32_t coord_num = 4; - const int32_t origin_n = grad_input_desc->dims[0]; - const int32_t origin_h = grad_input_desc->dims[1]; - const int32_t 
origin_w = grad_input_desc->dims[2]; - const int32_t origin_c = grad_input_desc->dims[3] / border_num; - const int32_t origin_k = boxes_desc->dims[1]; - - PARAM_CHECK(API, grad_output_desc->dtype == MLUOP_DTYPE_FLOAT || - grad_output_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(API, argmax_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(API, boxes_desc->dtype == grad_output_desc->dtype); - PARAM_CHECK(API, grad_input_desc->dtype == grad_output_desc->dtype); - - PARAM_CHECK(API, grad_output_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, argmax_idx_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, grad_input_desc->layout == MLUOP_LAYOUT_NHWC); - - PARAM_CHECK(API, grad_input_desc->dims[3] % border_num == 0); - PARAM_CHECK_NE(API, origin_n, 0); - PARAM_CHECK_NE(API, origin_c, 0); - PARAM_CHECK_NE(API, origin_h, 0); - PARAM_CHECK_NE(API, origin_w, 0); - PARAM_CHECK(API, origin_h * origin_w == origin_k); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, boxes_desc->dims[2] == coord_num); - PARAM_CHECK_NE(API, origin_k, 0); - PARAM_CHECK_GT(API, pool_size, 0); - - PARAM_CHECK_EQ(API, grad_output_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, grad_output_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, grad_output_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, grad_output_desc->dims[3], origin_c); - - PARAM_CHECK_EQ(API, boxes_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, boxes_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, boxes_desc->dims[2], coord_num); - - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[3], origin_c); - - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(grad_output_desc), - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(boxes_desc), LARGE_TENSOR_NUM, - ""); - TENSOR_NUM_CHECK(API, 
mluOpGetTensorElementNum(grad_input_desc), - LARGE_TENSOR_NUM, ""); - - PARAM_CHECK(API, grad_output != nullptr); - PARAM_CHECK(API, boxes != nullptr); - PARAM_CHECK(API, argmax_idx != nullptr); - PARAM_CHECK(API, grad_input != nullptr); - - // generate case prototxt - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("border_align_backward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "input1", grad_output, grad_output_desc, 100, 0); - GEN_CASE_DATA_REAL(true, "input2", boxes, boxes_desc); - GEN_CASE_DATA_REAL(true, "input3", argmax_idx, argmax_idx_desc); - GEN_CASE_DATA(false, "output1", grad_input, grad_input_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "border_align_backward", "pool_size", - pool_size); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, &k_dim, &k_type); - - VLOG(5) << "[mluOpBorderAlignBackward] cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_input)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << "[mluOpBorderAlignBackward] cnnlFill_v3 end."; - mluOpDataType_t input_dtype = grad_output_desc->dtype; - - VLOG(5) << "Launch Kernel KernelBorderAlignBackward<<>>"; - CHECK_RETURN( - API, KernelBorderAlignBackward( - k_dim, k_type, handle->queue, input_dtype, (void *)grad_output, - (void *)boxes, (int32_t *)argmax_idx, pool_size, origin_n, - origin_h, origin_w, origin_c, origin_k, (void *)grad_input)); - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/border_align_backward/border_align_backward.h b/kernels/border_align_backward/border_align_backward.h deleted file mode 100644 index 316e20f82..000000000 --- 
a/kernels/border_align_backward/border_align_backward.h +++ /dev/null @@ -1,34 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_BORDER_ALIGN_BACKWARD_BORDER_ALIGN_BACKWARD_H_ -#define KERNELS_BORDER_ALIGN_BACKWARD_BORDER_ALIGN_BACKWARD_H_ - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *grad_output, const void *boxes, - const int32_t *argmax_idx, const int32_t pool_size, const int32_t origin_n, - const int32_t origin_h, const int32_t origin_w, const int32_t origin_c, - const int32_t origin_k, void *grad_input); -#endif // KERNELS_BORDER_ALIGN_BACKWARD_BORDER_ALIGN_BACKWARD_H_ diff --git a/kernels/border_align_backward/border_align_backward_union1.mlu b/kernels/border_align_backward/border_align_backward_union1.mlu deleted file mode 100644 index 61c23ad01..000000000 --- a/kernels/border_align_backward/border_align_backward_union1.mlu +++ /dev/null @@ -1,311 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "border_align_backward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define BORDER_NUM 4 -#define CALCULATE_GRAD_INPUT(w, x, y) \ - const int32_t offset_##w = n * origin_h * origin_w * origin_c * BORDER_NUM + \ - y * origin_w * origin_c * BORDER_NUM + \ - x * origin_c * BORDER_NUM + border * origin_c + \ - c; \ - __bang_mul_scalar(nram_grad_input, nram_grad_output, w, deal_num_align); \ - __bang_band((char *)nram_grad_input, (char *)nram_grad_input, (char *)mask, \ - sizeof(T) * deal_num_align); \ - __bang_atomic_reduce_add(grad_input + offset_##w, nram_grad_input, deal_num); - -template -__mlu_func__ void computeGradInput( - T *nram_grad_input, T *nram_grad_output, T *grad_input, T *mask, const T w1, - const T w2, const T w3, const T w4, const int32_t x_low, - const int32_t y_low, const int32_t x_high, const int32_t y_high, - const int32_t origin_c, const int32_t c, const int32_t origin_w, - const int32_t n, const int32_t origin_h, const int32_t border, - const int32_t deal_num, const int32_t deal_num_align) { - /* bilinear-interpolation: - * v1 = input_HW[y_low, x_low] - * v2 = input_HW[y_low, x_high] - * v3 = input_HW[y_high, x_low] - * v4 = input_HW[y_high, x_high] - * - * forward: - * output_value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - * backwrad: - * v1.atomicAdd(grad_output_value * w1) - * ... 
- * v4.atomicAdd(grad_output_value * w4) - */ - CALCULATE_GRAD_INPUT(w1, x_low, y_low); - CALCULATE_GRAD_INPUT(w2, x_high, y_low); - CALCULATE_GRAD_INPUT(w3, x_low, y_high); - CALCULATE_GRAD_INPUT(w4, x_high, y_high); -} - -template -__mlu_func__ void bilinearInterpolate(const int32_t input_height, - const int32_t input_width, T y, T x, - T *w1, T *w2, T *w3, T *w4, - int32_t *x_low, int32_t *x_high, - int32_t *y_low, int32_t *y_high, - bool *empty) { - // deal with case that the point is out of feature map boundary - if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { - *empty = true; - *w1 = *w2 = *w3 = *w4 = 0; - *x_low = *x_high = *y_low = *y_high = -1; - return; - } - *empty = false; - if (y <= 0) y = (T)0; - if (x <= 0) x = (T)0; - - *y_low = int32_t(y); - *x_low = int32_t(x); - - if (*y_low >= input_height - 1) { - *y_high = *y_low = input_height - 1; - y = (T)(*y_low); - } else { - *y_high = *y_low + 1; - } - - if (*x_low >= input_width - 1) { - *x_high = *x_low = input_width - 1; - x = T(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low; - T lx = x - *x_low; - T hy = 1.0 - ly; - T hx = 1.0 - lx; - *w1 = hy * hx; - *w2 = hy * lx; - *w3 = ly * hx; - *w4 = ly * lx; -} - -template -__mlu_func__ void computeImpl(T *nram_grad_output, const T *grad_output, - int32_t *nram_argmax_idx, - const int32_t *argmax_idx, T *grad_input, - T *nram_grad_input, const T *nram_boxes, - const int32_t n, const int32_t c, const int32_t k, - const int32_t border, const int32_t origin_k, - const int32_t origin_n, const int32_t origin_c, - const int32_t origin_h, const int32_t origin_w, - const int32_t pool_size, const int32_t deal_num, - const int32_t deal_num_align) { - // argmax_idx, grad_output offset num - const int32_t src_offset = n * origin_k * origin_c * BORDER_NUM + - k * origin_c * BORDER_NUM + border * origin_c + c; - - // bilinear_interpolate params - int32_t x_low = 0, x_high = 0; - int32_t y_low = 0, y_high = 0; - bool empty = false; - 
T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - - const T x_start = *(nram_boxes + border / 2 * 2); - const T y_start = *(nram_boxes + 1 + border / 2 * 2); - const T box_width = *((T *)nram_boxes + 2) - *(T *)nram_boxes; - const T box_height = *((T *)nram_boxes + 3) - *((T *)nram_boxes + 1); - T x_stride = 0; - T y_stride = 0; - switch (border) { - case 0: { // Top - x_stride = box_width / pool_size; - y_stride = 0; - } break; - case 1: { // Left - x_stride = 0; - y_stride = box_height / pool_size; - } break; - case 2: { // Bottom - x_stride = -box_width / pool_size; - y_stride = 0; - } break; - case 3: { // Right - x_stride = 0; - y_stride = -box_height / pool_size; - } break; - } - - // layer 2: loop over range[0, pool_size] - for (int32_t i = 0; i < pool_size + 1; ++i) { - const T x = x_start + x_stride * i; - const T y = y_start + y_stride * i; - bilinearInterpolate(origin_h, origin_w, y, x, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - if (!empty) { - // load argmax, - __memcpy(nram_argmax_idx, argmax_idx + src_offset, - deal_num * sizeof(int32_t), GDRAM2NRAM); // NOLINT - - /* Creat mask, mask.shape([1, deal_num]) is the same as argmax_idx - * mask[1, j] = (T)1 if (argmax_idx[1, j] == pool_idx) - * = (T)0 otherwise - */ - __bang_write_value(nram_grad_output, deal_num_align, int32_t(i)); - __bang_eq(nram_argmax_idx, nram_argmax_idx, (int32_t *)nram_grad_output, - deal_num_align); // NOLINT - if (__mluop_is_float()) { - __nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT - } else { - __nram__ int16_t table[COMPUTE_COUNT_ALIGN] = {0, (int16_t)0xffff}; - __bang_int322int16((int16_t *)nram_argmax_idx, - (int32_t *)nram_argmax_idx, deal_num_align, 0, - 0); // NOLINT - __bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT - } - - // load 
grad_output, and calculate grad_input - __memcpy(nram_grad_output, grad_output + src_offset, deal_num * sizeof(T), - GDRAM2NRAM); // NOLINT - computeGradInput(nram_grad_input, nram_grad_output, grad_input, - (T *)nram_argmax_idx, w1, w2, w3, w4, x_low, y_low, - x_high, y_high, origin_c, c, origin_w, n, origin_h, - border, deal_num, deal_num_align); - } - } -} - -template -__mlu_global__ void MLUKernelBorderAlignBackward( - const T *grad_output, const T *boxes, const int32_t *argmax_idx, - const int32_t pool_size, const int32_t origin_n, const int32_t origin_h, - const int32_t origin_w, const int32_t origin_c, const int32_t origin_k, - T *grad_input) { - // unused MPU - if (__is_mpu()) { - return; - } - - /* - * NRAM partition - * |=============|=======================| - * | Semantics | Size | - * |=============|=======================| - * | grad_output | deal_num * sizeof(T) | - * |-------------|-----------------------| - * | grad_intput | deal_num * sizeof(T) | - * |-------------|-----------------------| - * | argmax_idx | deal_num * sizeof(int)| - * |-------------|-----------------------| - * | boxes | 128byte | - * |-------------|-----------------------| - */ - const int32_t deal_num = PAD_DOWN( - (MAX_NRAM_SIZE - NFU_ALIGN_SIZE) / (2 * sizeof(T) + 1 * sizeof(int32_t)), - NFU_ALIGN_SIZE); - T *nram_boxes = (T *)nram_buffer; - T *nram_grad_output = (T *)((char *)nram_buffer + NFU_ALIGN_SIZE); - T *nram_grad_input = (T *)nram_grad_output + deal_num; - int32_t *nram_argmax_idx = (int32_t *)((T *)nram_grad_input + deal_num); - - /* - * grad_output.shape = [origin_n, origin_k, border_num, origin_c] - * boxes.shape = [origin_n, origin_k, coord_num] - * argmax_idx.shape = [origin_n, origin_k, border_num, origin_c] - * coord_num = 4; - * border_num = 4; [0:Top, 1:Left, 2:Bottom, 3:Right] - * - * Partition output: - * Split the num of boxes(origin_n * origin_k * border_num) among taskDim, - * Mulitple core load the different part of the output - * in each loop. 
- * - * Calculation process: - * layer 0: 0 ~ origin_n * origin_k * border_num - * layer 1: 0 ~ origin_c - * layer 2: 0 ~ pool_size - */ - const int32_t coord_num = 4; - const int32_t total_num = origin_n * origin_k * BORDER_NUM; - const int32_t num_per_core = - total_num / taskDim + int32_t((total_num % taskDim) > taskId); - - // layer 0: loop over range[0, origin_n * origin_k * border_num) - for (int32_t i = 0; i < num_per_core; ++i) { - const int32_t idx = taskId + i * taskDim; - const int32_t n = idx / origin_k / BORDER_NUM; - const int32_t k = idx / BORDER_NUM % origin_k; - const int32_t border_idx = idx % BORDER_NUM; - - /* load boxes: - * boxes[n,k,0:4] indicates the information on the bottom left - * and top right points: [lb_x, lb_y, rt_x, rt_y] - */ - __memcpy(nram_boxes, (T *)boxes + n * origin_k * coord_num + k * coord_num, - coord_num * sizeof(T), GDRAM2NRAM); - - // layer 1: loop over range[0, origin_c) - const int32_t c_repeat = origin_c / deal_num; - const int32_t c_rem = origin_c % deal_num; - for (int32_t c_seg_idx = 0; c_seg_idx < c_repeat; ++c_seg_idx) { - computeImpl((T *)nram_grad_output, (T *)grad_output, - (int32_t *)nram_argmax_idx, (int32_t *)argmax_idx, - (T *)grad_input, (T *)nram_grad_input, nram_boxes, n, - c_seg_idx * deal_num, k, border_idx, origin_k, origin_n, - origin_c, origin_h, origin_w, pool_size, deal_num, deal_num); - } - if (c_rem != 0) { - const int32_t c_rem_align = PAD_UP(c_rem, NFU_ALIGN_SIZE); - computeImpl((T *)nram_grad_output, (T *)grad_output, - (int32_t *)nram_argmax_idx, (int32_t *)argmax_idx, - (T *)grad_input, (T *)nram_grad_input, nram_boxes, n, - origin_c - c_rem, k, border_idx, origin_k, origin_n, origin_c, - origin_h, origin_w, pool_size, c_rem, c_rem_align); - } - } -} - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, const void *grad_output, const void *boxes, - const int32_t *argmax_idx, const int32_t 
pool_size, const int32_t origin_n, - const int32_t origin_h, const int32_t origin_w, const int32_t origin_c, - const int32_t origin_k, void *grad_input) { - // launch kernel - if (data_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUKernelBorderAlignBackward<<>>( - (float *)grad_output, (float *)boxes, (int32_t *)argmax_idx, pool_size, - origin_n, origin_h, origin_w, origin_c, origin_k, (float *)grad_input)); - - } else { - // half - KERNEL_CHECK(MLUKernelBorderAlignBackward<<>>( - (half *)grad_output, (half *)boxes, (int32_t *)argmax_idx, pool_size, - origin_n, origin_h, origin_w, origin_c, origin_k, (half *)grad_input)); - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/border_align_forward/border_align_forward.cpp b/kernels/border_align_forward/border_align_forward.cpp deleted file mode 100644 index 86af97dca..000000000 --- a/kernels/border_align_forward/border_align_forward.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS self.tcp LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *******************************************************************************/ -#include "border_align_forward.h" - -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" - -// policyFunc -static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -mluOpStatus_t mluOpBorderAlignForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - const void *input, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const int32_t pool_size, - const mluOpTensorDescriptor_t output_desc, void *output, - const mluOpTensorDescriptor_t argmax_idx_desc, void *argmax_idx) { - const std::string API = "[mluOpBorderAlignForward]"; - PARAM_CHECK(API, handle != nullptr); - PARAM_CHECK(API, input_desc != nullptr); - PARAM_CHECK(API, boxes_desc != nullptr); - PARAM_CHECK(API, output_desc != nullptr); - PARAM_CHECK(API, argmax_idx_desc != nullptr); - - PARAM_CHECK(API, input_desc->dim == 4); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, output_desc->dim == 4); - PARAM_CHECK(API, argmax_idx_desc->dim == 4); - - const int32_t border_num = 4; - const int32_t coord_num = 4; - const int32_t origin_n = input_desc->dims[0]; - const int32_t origin_h = input_desc->dims[1]; - const int32_t origin_w = input_desc->dims[2]; - const int32_t origin_c = input_desc->dims[3] / border_num; - const int32_t origin_k = boxes_desc->dims[1]; - - 
PARAM_CHECK(API, input_desc->dtype == boxes_desc->dtype); - PARAM_CHECK(API, input_desc->dtype == MLUOP_DTYPE_FLOAT || - input_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(API, boxes_desc->dtype == MLUOP_DTYPE_FLOAT || - boxes_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(API, output_desc->dtype == input_desc->dtype); - PARAM_CHECK(API, argmax_idx_desc->dtype == MLUOP_DTYPE_INT32); - - PARAM_CHECK(API, input_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, output_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, argmax_idx_desc->layout == MLUOP_LAYOUT_NHWC); - - PARAM_CHECK(API, input_desc->dims[3] % border_num == 0); - PARAM_CHECK_NE(API, origin_n, 0); - PARAM_CHECK_NE(API, origin_c, 0); - PARAM_CHECK_NE(API, origin_h, 0); - PARAM_CHECK_NE(API, origin_w, 0); - PARAM_CHECK_NE(API, origin_k, 0); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, boxes_desc->dims[2] == coord_num); - - PARAM_CHECK(API, origin_n == boxes_desc->dims[0]); - PARAM_CHECK(API, origin_h * origin_w == origin_k); - PARAM_CHECK_EQ(API, output_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, output_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, output_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, output_desc->dims[3], origin_c); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[3], origin_c); - - const size_t input_num = mluOpGetTensorElementNum(input_desc); - const size_t boxes_num = mluOpGetTensorElementNum(boxes_desc); - const size_t output_num = mluOpGetTensorElementNum(output_desc); - TENSOR_NUM_CHECK(API, input_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, boxes_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, output_num, LARGE_TENSOR_NUM, ""); - - PARAM_CHECK(API, input != nullptr); - PARAM_CHECK(API, boxes != nullptr); - PARAM_CHECK(API, output != nullptr); - PARAM_CHECK(API, 
argmax_idx != nullptr); - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("border_align_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "input1", input, input_desc, 100, 0); - GEN_CASE_DATA_REAL(true, "input2", boxes, boxes_desc); - GEN_CASE_DATA(false, "output1", output, output_desc, 0, 0); - GEN_CASE_DATA(false, "output2", argmax_idx, argmax_idx_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "border_align_forward", "pool_size", pool_size); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0.003, 0, 0); - } - - cnrtFunctionType_t k_type; - cnrtDim3_t k_dim; - policyFunc(handle, &k_dim, &k_type); - - VLOG(5) << "Launch Kernel KernelBorderAlignForward<<>>"; - CHECK_RETURN(API, KernelBorderAlignForward( - k_dim, k_type, handle->queue, input_desc->dtype, input, - boxes, pool_size, origin_n, origin_h, origin_w, - origin_c, origin_k, output, (int32_t *)argmax_idx)); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/border_align_forward/border_align_forward.h b/kernels/border_align_forward/border_align_forward.h deleted file mode 100644 index a7e146dd3..000000000 --- a/kernels/border_align_forward/border_align_forward.h +++ /dev/null @@ -1,37 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS self.tcp LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *******************************************************************************/ -#ifndef KERNELS_BORDER_ALIGN_FORWARD_BORDER_ALIGN_FORWARD_H_ -#define KERNELS_BORDER_ALIGN_FORWARD_BORDER_ALIGN_FORWARD_H_ - -#include "mlu_op.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *input, const void *boxes, - const int32_t pool_size, const int32_t origin_n, const int32_t origin_h, - const int32_t origin_w, const int32_t origin_c, const int32_t origin_k, - void *output, int32_t *argmax_idx_nram); - -#endif // KERNELS_BORDER_ALIGN_FORWARD_BORDER_ALIGN_FORWARD_H_ diff --git a/kernels/border_align_forward/border_align_forward_union1.mlu b/kernels/border_align_forward/border_align_forward_union1.mlu deleted file mode 100644 index b18449b6f..000000000 --- a/kernels/border_align_forward/border_align_forward_union1.mlu +++ /dev/null @@ -1,413 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS self.tcp LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *******************************************************************************/ -#include "border_align_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -#define BORDER_NUM 4 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void bilinearInterpolate(const int32_t input_height, - const int32_t input_width, T x, T y, - T *w1, T *w2, T *w3, T *w4, - int32_t *x_low, int32_t *x_high, - int32_t *y_low, int32_t *y_high, - bool *empty) { - // deal with case that the point is out of feature map boundary - // https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp#L29 - if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { - *empty = true; - return; - } - *empty = false; - if (y <= 0) y = (T)0; - if (x <= 0) x = (T)0; - - *y_low = int32_t(y); - *x_low = int32_t(x); - - if (*y_low >= input_height - 1) { - *y_high = *y_low = input_height - 1; - y = (T)(*y_low); - } else { - *y_high = *y_low + 1; - } - - if (*x_low >= input_width - 1) { - *x_high = *x_low = input_width - 1; - x = T(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low; - T lx = x - *x_low; - T hy = 1. - ly; - T hx = 1. - lx; - *w1 = hy * hx; - *w2 = hy * lx; - *w3 = ly * hx; - *w4 = ly * lx; -} - -template -__mlu_func__ void getBilinearInterpolateResult(T *input_ping_nram, const T &w1, - const T &w2, const T &w3, - const T &w4, - const int32_t &deal_num) { - /* do bilinear interpolation: - * value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - * st. 
v1 = HW[y_low, x_low] - * v2 = HW[y_low, x_high] - * v3 = HW[y_high, x_low] - * v4 = HW[y_high, x_high] - */ - T *v1 = input_ping_nram; - T *v2 = input_ping_nram + 1 * deal_num; - T *v3 = input_ping_nram + 2 * deal_num; - T *v4 = input_ping_nram + 3 * deal_num; - - __bang_mul_scalar(v1, v1, w1, deal_num); - __bang_fusion(FUSION_FMA, v2, v2, w2, v1, deal_num, deal_num); - __bang_fusion(FUSION_FMA, v3, v3, w3, v2, deal_num, deal_num); - __bang_fusion(FUSION_FMA, v1, v4, w4, v3, deal_num, deal_num); -} - -template -__mlu_func__ void computeMaxPoolAndArgmaxIdx(int32_t *argmax_idx_nram, - T *output_nram, T *input_ping_nram, - const int32_t &pool_idx, - const int32_t &deal_num) { - if (pool_idx == 0) { - __bang_move(output_nram, input_ping_nram, deal_num * sizeof(T)); - return; - } - int32_t *temp = (int32_t *)input_ping_nram; - int32_t *temp1 = temp + deal_num; - __bang_lt((T *)temp1, output_nram, input_ping_nram, deal_num); - - // 1. output = max(value, output) - __bang_maxequal(output_nram, output_nram, input_ping_nram, deal_num); - - // 2. 
update argmax_idx - // 2.1 argmax_idx *= (output >= value) - // 2.2 argmax_idx += pool_idx * (output < value) - if (__mluop_is_float()) { - __bang_float2int32_rd(temp, (float *)temp1, deal_num, 0); - } else { - __bang_half2int32_rd(temp, (half *)temp1, deal_num, 0); - } - - __bang_not(temp1, temp, deal_num); // 2.1 - __bang_mul(argmax_idx_nram, argmax_idx_nram, temp1, deal_num); - __bang_mul_scalar(temp, temp, pool_idx, deal_num); // 2.2 - __bang_add(argmax_idx_nram, argmax_idx_nram, temp, deal_num); -} - -template -__mlu_func__ void pipeline(T *input_ping_nram, const T *input, T *boxes_nram, - int32_t *argmax_idx_nram, T *base_output, - int32_t *base_argmax_idx, T *output_nram, - const int32_t n, const int32_t c_offset, - const int32_t origin_k, const int32_t origin_h, - const int32_t origin_w, const int32_t origin_c, - const int32_t pool_size, T x, T y, const T x_stride, - const T y_stride, const int32_t border, - const int32_t pingpong_gap, const int32_t deal_num) { - // init params of bilinear-interpolate - int32_t x_low = 0, x_high = 0; - int32_t y_low = 0, y_high = 0; - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - bool empty = false; - bilinearInterpolate(origin_h, origin_w, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - - /* - * Pipeline: - * The pipeline is processed in three stages: Load, Compute, - * Store. The allocated memory space of NRAM is divided into - * two parts: PING and Pong. In one time step, PING and PONG - * works on different stream built in chip. For example, while - * PING is loading data from GDRAM, PONG is computing data - * from last time step, or in turn. Both of them are processed - * synchronously until finished. - * - * diagram of PINGPONG: - * |------|-----------------------------------------------------| - * | | space | - * |------|-----------------------------------------------------| - * | time | Ping | Pong | Ping | ... 
| Pong | - * |------|-----------------------------------------------------| - * | 0 | L0 | | | | | - * | 1 | C0 | L1 | | | | - * | 2 | | C1 | L2 | | | - * | 3 | | | C2 | ... | | - * | . | | | | ... | | - * | . | | | | ... | L_end | - * | . | | | | | C_end | - * | . | | | | | S | - * |------|-----------------------------------------------------| - */ - -#define LOAD_INPUT(dst, src, h, w, idx) \ - const int32_t src_offset_##idx = \ - ((n * origin_h + h) * origin_w + w) * BORDER_NUM * origin_c + \ - border * origin_c + c_offset; \ - __memcpy_async(dst + idx * deal_num_align, src + src_offset_##idx, \ - deal_num * sizeof(T), GDRAM2NRAM); - - // L0 - const int32_t deal_num_align = PAD_UP(deal_num, NFU_ALIGN_SIZE); - __bang_write_value(argmax_idx_nram, deal_num_align, (int32_t)0); - if (!empty) { - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_low, x_low, 0); - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_low, x_high, 1); - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_high, x_low, 2); - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_high, x_high, 3); - } else { - __memset_nram(input_ping_nram, pingpong_gap, (T)0); - } - __sync(); - - T w1_previous = w1; - T w2_previous = w2; - T w3_previous = w3; - T w4_previous = w4; - bool empty_previous = empty; - - x += x_stride; - y += y_stride; - bilinearInterpolate(origin_h, origin_w, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - - // layer 3: loop over range[0, pool_size) - for (int32_t i = 0; i < pool_size; ++i) { - /**** Load ****/ - T *input_nram_load = input_ping_nram + int32_t((i + 1) % 2) * pingpong_gap; - if (!empty) { - LOAD_INPUT((T *)input_nram_load, (T *)input, y_low, x_low, 0); - LOAD_INPUT((T *)input_nram_load, (T *)input, y_low, x_high, 1); - LOAD_INPUT((T *)input_nram_load, (T *)input, y_high, x_low, 2); - LOAD_INPUT((T *)input_nram_load, (T *)input, y_high, x_high, 3); - } - - /**** Compute ****/ - T *input_nram_compute = input_ping_nram + int32_t(i % 2) * pingpong_gap; - if 
(!empty_previous) { - // value = 0 point outside of the box - // = sum(w[j] * v[j]), j=1,2,3,4 otherwise - getBilinearInterpolateResult(input_nram_compute, w1_previous, w2_previous, - w3_previous, w4_previous, deal_num_align); - } else { - __bang_write_value(input_nram_compute, deal_num_align, (T)0); - } - computeMaxPoolAndArgmaxIdx(argmax_idx_nram, output_nram, input_nram_compute, - i, deal_num_align); - { - // update x,y and store previous-value - w1_previous = w1; - w2_previous = w2; - w3_previous = w3; - w4_previous = w4; - empty_previous = empty; - - x += x_stride; - y += y_stride; - bilinearInterpolate(origin_h, origin_w, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - } - __sync(); - } - - // C_end - if (!empty_previous) { - getBilinearInterpolateResult( - input_ping_nram + int32_t((pool_size) % 2) * pingpong_gap, w1_previous, - w2_previous, w3_previous, w4_previous, deal_num_align); - } else { - __bang_write_value(input_ping_nram + int32_t(pool_size % 2) * pingpong_gap, - deal_num_align, (T)0); - } - computeMaxPoolAndArgmaxIdx( - argmax_idx_nram, output_nram, - input_ping_nram + int32_t(pool_size % 2) * pingpong_gap, pool_size, - deal_num_align); - - // S - __memcpy(base_output + c_offset, output_nram, deal_num * sizeof(T), - NRAM2GDRAM); // NOLINT - __memcpy(base_argmax_idx + c_offset, argmax_idx_nram, - deal_num * sizeof(int32_t), NRAM2GDRAM); // NOLINT -} - -template -__mlu_global__ void MLUKernelBorderAlignForward( - const T *input, const T *boxes, const int32_t pool_size, - const int32_t origin_n, const int32_t origin_h, const int32_t origin_w, - const int32_t origin_c, const int32_t origin_k, T *output, - int32_t *argmax_idx) { - // unused MPU - if (__is_mpu()) { - return; - } - - /* - * NRAM partition - * |--------------------------------------------------------| - * | Semantics | NRAM | - * |------------|-------------------------------------------| - * | PING | input_lt | input_lb | input_rt | input_rb | - * 
|------------|----------|----------|----------|----------| - * | PONG | input_lt | input_lb | input_rt | input_rb | - * |------------|----------|----------|----------|----------| - * | Other | output |argmax_idx| boxes | - * |---------------------------------------------| - * - * MAX_NRAM_SIZE = - * PING {4 * deal_num * sizeof(T)} + - * PONG {4 * deal_num * sizeof(T)} + - * Other{ deal_num * sizeof(T) + - * deal_num * sizeof(int32_t) + 128byte} - */ - const int32_t pingpong_split_num = 4 + 4; - const int32_t deal_num = - PAD_DOWN(((MAX_NRAM_SIZE - NFU_ALIGN_SIZE) / - (pingpong_split_num * sizeof(T) + sizeof(T) + sizeof(int32_t))), - NFU_ALIGN_SIZE); - const int32_t pingpong_gap = 4 * deal_num; - - T *input_ping_nram = (T *)nram_buffer; - T *output_nram = input_ping_nram + pingpong_split_num * deal_num; - T *boxes_nram = output_nram + deal_num; - int32_t *argmax_idx_nram = (int32_t *)((char *)boxes_nram + NFU_ALIGN_SIZE); - - /* - * input.shape = [origin_n, origin_h, origin_w, border_num * origin_c] - * boxes.shape = [origin_n, origin_k, coord_num] - * output.shape = [origin_n, origin_k, border_num, origin_c] - * argmax_idx.shape = [origin_n, origin_k, border_num, origin_c] - * coord_num = 4; - * border_num = 4; - * - * Partition output: - * Split the num of boxes(origin_n * origin_k) among taskDim, Mulitple - * core load the different part of the output in each loop. 
- * - * Calculation process: - * |—— layer 0: 0 ~ origin_n * origin_k - * |————— layer 1: 0 ~ border_num - * |———————— layer 2: 0 ~ origin_c - * |——————————— layer 3: 0 ~ pool_size - */ - const int32_t coord_num = 4; - const int32_t boxes_num = origin_n * origin_k; - const int32_t boxes_num_per_core = - boxes_num / taskDim + int32_t((boxes_num % taskDim) > taskId); - - // layer 0: loop over range[0, boxes_num_per_core) - for (int32_t i = 0; i < boxes_num_per_core; ++i) { - /* load boxes: - * boxes[n,k,0:4] indicates the information on the bottom left - * and top right points: [lb_x, lb_y, rt_x, rt_y] - */ - const int32_t nk_offset = taskId + i * taskDim; - __memcpy(boxes_nram, (T *)boxes + nk_offset * coord_num, - coord_num * sizeof(T), GDRAM2NRAM); - const T box_width = boxes_nram[2] - boxes_nram[0]; - const T box_height = boxes_nram[3] - boxes_nram[1]; - T x_stride = 0; - T y_stride = 0; - - // layer 1: loop over [0:Top, 1:Left, 2:Bottom, 3:Right] - for (int32_t border = 0; border < BORDER_NUM; ++border) { - switch (border) { - case 0: { // Top - x_stride = box_width / pool_size; - y_stride = 0; - } break; - case 1: { // Left - x_stride = 0; - y_stride = box_height / pool_size; - } break; - case 2: { // Bottom - x_stride = -box_width / pool_size; - y_stride = 0; - } break; - case 3: { // Right - x_stride = 0; - y_stride = -box_height / pool_size; - } break; - } - T x = *(boxes_nram + border / 2 * 2); - T y = *(boxes_nram + border / 2 * 2 + 1); - - // gdram_ptr of ouput,argmax_idx - T *base_output = - output + nk_offset * BORDER_NUM * origin_c + border * origin_c; - int32_t *base_argmax_idx = - argmax_idx + nk_offset * BORDER_NUM * origin_c + border * origin_c; - - // layer 2: loop over range[0, origin_c) - const int32_t c_repeat = origin_c / deal_num; - const int32_t c_rem = origin_c % deal_num; - for (int32_t c_seg_idx = 0; c_seg_idx < c_repeat; ++c_seg_idx) { - pipeline(input_ping_nram, input, boxes_nram, argmax_idx_nram, - base_output, base_argmax_idx, 
output_nram, - nk_offset / origin_k, c_seg_idx * deal_num, origin_k, - origin_h, origin_w, origin_c, pool_size, x, y, x_stride, - y_stride, border, pingpong_gap, deal_num); - } - if (c_rem != 0) { - pipeline(input_ping_nram, input, boxes_nram, argmax_idx_nram, - base_output, base_argmax_idx, output_nram, - nk_offset / origin_k, origin_c - c_rem, origin_k, origin_h, - origin_w, origin_c, pool_size, x, y, x_stride, y_stride, - border, pingpong_gap, c_rem); - } - } - } -} - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignForward( - const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, - const cnrtQueue_t queue, mluOpDataType_t data_type, const void *input, - const void *boxes, const int32_t pool_size, const int32_t origin_n, - const int32_t origin_h, const int32_t origin_w, const int32_t origin_c, - const int32_t origin_k, void *output, int32_t *argmax_idx_nram) { - // launch kernel - if (data_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUKernelBorderAlignForward<<>>( - (float *)input, (float *)boxes, pool_size, origin_n, origin_h, origin_w, - origin_c, origin_k, (float *)output, (int32_t *)argmax_idx_nram)); - } else { - // half - KERNEL_CHECK(MLUKernelBorderAlignForward<<>>( - (half *)input, (half *)boxes, pool_size, origin_n, origin_h, origin_w, - origin_c, origin_k, (half *)output, (int32_t *)argmax_idx_nram)); - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dcn_backward_data/dcn_backward_data.cpp b/kernels/dcn_backward_data/dcn_backward_data.cpp deleted file mode 100755 index aa20bb224..000000000 --- a/kernels/dcn_backward_data/dcn_backward_data.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/************************************************************************* - * Copyright (C) [2024] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include -#include - -#include "kernels/utils/cnnl_helper.h" - -#define DCNBPDATA_API "mluOpDCNBackwardData" - -mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBakcwardDataWorkspaceSize( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, - const mluOpTensorDescriptor_t offset_desc, - const mluOpTensorDescriptor_t mask_desc, - const mluOpTensorDescriptor_t filter_desc, - const mluOpTensorDescriptor_t grad_output_desc, - const mluOpTensorDescriptor_t grad_input_desc, - const mluOpTensorDescriptor_t grad_offset_desc, - const mluOpTensorDescriptor_t grad_mask_desc, size_t *workspace_size) { - PARAM_CHECK(DCNBPDATA_API, handle != NULL); - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, input_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, offset_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, filter_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, grad_output_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, grad_input_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, grad_offset_desc != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - cnnl_grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_grad_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_offset_desc, - cnnl_grad_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_mask_desc, - cnnl_grad_mask_desc); - - CHECK_FUNC_RETURN( - cnnlGetDCNBakcwardDataWorkspaceSize( - cnnl_handle, 
dcn_desc, cnnl_input_desc, cnnl_offset_desc, - cnnl_mask_desc, cnnl_filter_desc, cnnl_grad_output_desc, - cnnl_grad_input_desc, cnnl_grad_offset_desc, cnnl_grad_mask_desc, - workspace_size), - CNNL_STATUS_SUCCESS, - "[mluOpGetDCNBakcwardDataWorkspaceSize] Internal error accured in " - "cnnlGetDCNBakcwardDataWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_mask_desc); - - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDCNBackwardData( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t offset_desc, const void *offset, - const mluOpTensorDescriptor_t mask_desc, const void *mask, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_input_desc, void *grad_input, - const mluOpTensorDescriptor_t grad_offset_desc, void *grad_offset, - const mluOpTensorDescriptor_t grad_mask_desc, void *grad_mask) { - PARAM_CHECK(DCNBPDATA_API, handle != NULL); - if (workspace_size > 0) { - PARAM_CHECK(DCNBPDATA_API, workspace != NULL); - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - cnnl_grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_grad_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_offset_desc, - cnnl_grad_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_mask_desc, - cnnl_grad_mask_desc); - CHECK_FUNC_RETURN( - cnnlDCNBackwardData( - cnnl_handle, dcn_desc, cnnl_input_desc, input, cnnl_offset_desc, - offset, cnnl_mask_desc, mask, cnnl_filter_desc, filter, - cnnl_grad_output_desc, grad_output, workspace, workspace_size, - cnnl_grad_input_desc, grad_input, cnnl_grad_offset_desc, grad_offset, - cnnl_grad_mask_desc, grad_mask), - CNNL_STATUS_SUCCESS, - "[mluOpDcnBackwardData] Internal error accured in cnnlDCNBackwardData.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_mask_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dcn_backward_weight/dcn_backward_weight.cpp b/kernels/dcn_backward_weight/dcn_backward_weight.cpp deleted file mode 100644 index 0f9bcb094..000000000 --- a/kernels/dcn_backward_weight/dcn_backward_weight.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/************************************************************************* - * Copyright (C) [2024] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include -#include - -#include "kernels/utils/cnnl_helper.h" - -#define DCNBACKWARDWEIGHT_API "mluOpDCNBackwardWeight" - -mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBackwardWeightWorkspaceSize( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, - const mluOpTensorDescriptor_t offset_desc, - const mluOpTensorDescriptor_t mask_desc, - const mluOpTensorDescriptor_t grad_output_desc, - const mluOpTensorDescriptor_t grad_filter_desc, - const mluOpTensorDescriptor_t grad_bias_desc, size_t *size) { - PARAM_CHECK("mluOpDCNBackwardWeight", handle != NULL); - PARAM_CHECK("mluOpDCNBackwardWeight", dcn_desc != NULL); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, _handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, _input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, _offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, _mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - _grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc, - _grad_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc, _grad_bias_desc); - CHECK_FUNC_RETURN( - cnnlGetDCNBackwardWeightWorkspaceSize( - _handle, dcn_desc, _input_desc, _offset_desc, _mask_desc, - _grad_output_desc, _grad_filter_desc, _grad_bias_desc, size), - CNNL_STATUS_SUCCESS, - "[mluOpDCNBackwardWeight] Internal error accured in " - "mluOpGetDCNBackwardWeightWorkspaceSize.", // NOLINT - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_bias_desc); - DESTROY_CNNL_HANDLE(_handle); - return MLUOP_STATUS_SUCCESS; -} - 
-mluOpStatus_t MLUOP_WIN_API mluOpDCNBackwardWeight( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t offset_desc, const void *offset, - const mluOpTensorDescriptor_t mask_desc, const void *mask, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_filter_desc, void *grad_filter, - const mluOpTensorDescriptor_t grad_bias_desc, void *grad_bias) { - PARAM_CHECK(DCNBACKWARDWEIGHT_API, handle != NULL); - if (workspace_size > 0) { - PARAM_CHECK(DCNBACKWARDWEIGHT_API, workspace != NULL); - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - cnnl_grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc, - cnnl_grad_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc, - cnnl_grad_bias_desc); - CHECK_FUNC_RETURN( - cnnlDCNBackwardWeight(cnnl_handle, dcn_desc, cnnl_input_desc, input, - cnnl_offset_desc, offset, cnnl_mask_desc, mask, - cnnl_grad_output_desc, grad_output, workspace, - workspace_size, cnnl_grad_filter_desc, grad_filter, - cnnl_grad_bias_desc, grad_bias), - CNNL_STATUS_SUCCESS, - "[mluOpDcnBackwardWeight] Internal error accured in " - "mluOpDcnBackwardWeight.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_filter_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_bias_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dcn_forward/dcn_common.h b/kernels/dcn_forward/dcn_common.h deleted file mode 100644 index 59acab57a..000000000 --- a/kernels/dcn_forward/dcn_common.h +++ /dev/null @@ -1,69 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_DCN_COMMON_DCN_COMMON_H -#define KERNELS_DCN_COMMON_DCN_COMMON_H -#include -#include -#include - -#include "kernels/utils/cnnl_helper.h" - -#define DCN_API "mluOpDCN" - -mluOpStatus_t MLUOP_WIN_API -mluOpCreateDCNDescriptor(mluOpDCNDescriptor_t *dcn_desc) { - PARAM_CHECK(DCN_API, dcn_desc != NULL); - CHECK_FUNC_RETURN(cnnlCreateDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS, - "[mluOpDcn] Internal error accured in " - "mluOpCreateDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc) { - PARAM_CHECK(DCN_API, dcn_desc != NULL); - CHECK_FUNC_RETURN(cnnlDestroyDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS, - "[mluOpDcn] Internal error accured in " - "mluOpDestroyDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSetDCNDescriptor( - mluOpDCNDescriptor_t dcn_desc, int dimNb, const int pad[], - const int stride[], const int dilation[], int deformable_group, - int conv_group, int im2col_step, const mluOpDataType_t compute_type) { - PARAM_CHECK(DCN_API, dcn_desc != NULL); - CHECK_FUNC_RETURN( - cnnlSetDCNDescriptor(dcn_desc, dimNb, pad, stride, dilation, - deformable_group, conv_group, im2col_step, - cnnlDataType_t(compute_type)), - CNNL_STATUS_SUCCESS, - "[mluOpDcn] Internal error accured in " - "mluOpSetDCNDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -#endif // KERNELS_DCN_COMMON_DCN_COMMON_H diff --git a/kernels/dcn_forward/dcn_forward.cpp b/kernels/dcn_forward/dcn_forward.cpp deleted file mode 100644 index c746f8971..000000000 --- a/kernels/dcn_forward/dcn_forward.cpp +++ /dev/null @@ -1,103 +0,0 @@ -/************************************************************************* - * Copyright (C) [2024] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/dcn_forward/dcn_common.h" - -#define DCNFORWARD_API "mluOpDCNForward" - -mluOpStatus_t MLUOP_WIN_API mluOpGetDCNForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, - const mluOpTensorDescriptor_t offset_desc, - const mluOpTensorDescriptor_t mask_desc, - const mluOpTensorDescriptor_t filter_desc, - const mluOpTensorDescriptor_t bias_desc, - const mluOpTensorDescriptor_t output_desc, size_t *size) { - PARAM_CHECK("mluOpDCNForward", handle != NULL); - PARAM_CHECK("mluOpDCNForward", dcn_desc != NULL); - PARAM_CHECK("mluOpDCNForward", input_desc != NULL); - PARAM_CHECK("mluOpDCNForward", offset_desc != NULL); - PARAM_CHECK("mluOpDCNForward", filter_desc != NULL); - PARAM_CHECK("mluOpDCNForward", output_desc != NULL); - PARAM_CHECK("mluOpDCNForward", size != NULL); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - CHECK_FUNC_RETURN(cnnlGetDCNForwardWorkspaceSize( - cnnl_handle, dcn_desc, cnnl_input_desc, - cnnl_offset_desc, cnnl_mask_desc, cnnl_filter_desc, - cnnl_bias_desc, cnnl_output_desc, size), - CNNL_STATUS_SUCCESS, - "[mluOpDCNForward] Internal error accured in " - "mluOpGetDCNForwardWorkspaceSize.", // NOLINT - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDCNForward(mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t offset_desc, const void *offset, - const mluOpTensorDescriptor_t mask_desc, const void *mask, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t bias_desc, const void *bias, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t output_desc, void *output) { - PARAM_CHECK(DCNFORWARD_API, handle != NULL); - if (workspace_size > 0) { - PARAM_CHECK(DCNFORWARD_API, workspace != NULL); - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlDCNForward(cnnl_handle, dcn_desc, cnnl_input_desc, input, - cnnl_offset_desc, offset, cnnl_mask_desc, mask, - cnnl_filter_desc, filter, cnnl_bias_desc, bias, workspace, - workspace_size, cnnl_output_desc, output), - CNNL_STATUS_SUCCESS, - "[mluOpDcnForward] Internal error accured in mluOpDcnForward.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp b/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp deleted file mode 100644 index 5b316f2f1..000000000 --- a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp +++ /dev/null @@ -1,377 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "dynamic_point_to_voxel_backward.h" - -#include // std::min -#include - -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" // mluop::getSizeOfDataType -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static mluOpStatus_t DynamicPointToVoxelBackwardParamCheck( - const char *interface_name, const mluOpHandle_t handle, - const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t grad_voxel_feats_desc, - const void *grad_voxel_feats, const mluOpTensorDescriptor_t feats_desc, - const void *feats, const mluOpTensorDescriptor_t voxel_feats_desc, - const void *voxel_feats, const mluOpTensorDescriptor_t point2voxel_map_desc, - const void *point2voxel_map, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const void *voxel_points_count, - const mluOpTensorDescriptor_t voxel_num_desc, const void *voxel_num, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_feats_desc, void *grad_feats, - bool &zero_element) { - // check handle - PARAM_CHECK(interface_name, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << interface_name - << "Only mlu300 and above devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - // check desc - PARAM_CHECK(interface_name, grad_voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, feats_desc != NULL); - PARAM_CHECK(interface_name, voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, point2voxel_map_desc != NULL); - PARAM_CHECK(interface_name, voxel_points_count_desc != NULL); - PARAM_CHECK(interface_name, voxel_num_desc != NULL); - PARAM_CHECK(interface_name, grad_feats_desc != NULL); - - // check data type - PARAM_CHECK(interface_name, - grad_voxel_feats_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK(interface_name, feats_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK(interface_name, voxel_feats_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK(interface_name, grad_feats_desc->dtype == MLUOP_DTYPE_FLOAT); - - PARAM_CHECK(interface_name, point2voxel_map_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(interface_name, - voxel_points_count_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(interface_name, voxel_num_desc->dtype == MLUOP_DTYPE_INT32); - - // check shape - PARAM_CHECK(interface_name, grad_voxel_feats_desc->dim == 2); - PARAM_CHECK(interface_name, feats_desc->dim == 2); - PARAM_CHECK(interface_name, voxel_feats_desc->dim == 2); - PARAM_CHECK(interface_name, point2voxel_map_desc->dim == 1); - PARAM_CHECK(interface_name, voxel_points_count_desc->dim == 1); - PARAM_CHECK(interface_name, voxel_num_desc->dim == 1); - PARAM_CHECK(interface_name, grad_feats_desc->dim == 2); - - PARAM_CHECK(interface_name, - feats_desc->dims[1] == grad_voxel_feats_desc->dims[1]); - PARAM_CHECK(interface_name, - voxel_feats_desc->dims[0] == grad_voxel_feats_desc->dims[0]); - PARAM_CHECK(interface_name, - voxel_feats_desc->dims[1] == grad_voxel_feats_desc->dims[1]); - PARAM_CHECK(interface_name, - point2voxel_map_desc->dims[0] == feats_desc->dims[0]); - PARAM_CHECK(interface_name, voxel_points_count_desc->dims[0] == - grad_voxel_feats_desc->dims[0]); - 
PARAM_CHECK(interface_name, voxel_num_desc->dims[0] == 1); - PARAM_CHECK(interface_name, grad_feats_desc->dims[0] == feats_desc->dims[0]); - PARAM_CHECK(interface_name, - grad_feats_desc->dims[1] == grad_voxel_feats_desc->dims[1]); - PARAM_CHECK(interface_name, - feats_desc->dims[0] >= grad_voxel_feats_desc->dims[0]); - - // param check - if (reduce_type != MLUOP_REDUCE_DMAX) { - LOG(ERROR) << interface_name - << " only supports max reduce in current version. "; - return MLUOP_STATUS_BAD_PARAM; - } - - // large tensor - const uint64_t grad_voxel_feats_element_num = - mluOpGetTensorElementNum(grad_voxel_feats_desc); - const uint64_t feats_element_num = mluOpGetTensorElementNum(feats_desc); - TENSOR_NUM_CHECK(interface_name, grad_voxel_feats_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(interface_name, feats_element_num, LARGE_TENSOR_NUM, ""); - - // kernel size check - const int N = feats_desc->dims[0]; - const int C = feats_desc->dims[1]; - const size_t dtype_bytes = mluop::getSizeOfDataType(feats_desc->dtype); - const size_t idx_dtype_bytes = - mluop::getSizeOfDataType(point2voxel_map_desc->dtype); - if (N * (idx_dtype_bytes + 1) + C * (2 * dtype_bytes + 3 * idx_dtype_bytes) + - idx_dtype_bytes > - handle->nram_size) { - // float + int - LOG(ERROR) - << interface_name - << " The feats dtype is float, point2voxel_map dtype is int. 
The feats " - "shape is [" - << N << ", " << C << "]" - << ", should meet constraint : " - "5*feats_desc->dims[0]+20*feats_desc->dims[1]+sizeof(int) <= " - << handle->nram_size; - return MLUOP_STATUS_BAD_PARAM; - } - - // 0-element check, after dim and shape check - if (mluOpGetTensorElementNum(grad_feats_desc) == 0) { - zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - if (grad_voxel_feats_element_num != 0) { - PARAM_CHECK(interface_name, grad_voxel_feats != NULL); - } - PARAM_CHECK(interface_name, feats != NULL); - if (mluOpGetTensorElementNum(voxel_feats_desc) != 0) { - PARAM_CHECK(interface_name, voxel_feats != NULL); - } - PARAM_CHECK(interface_name, point2voxel_map != NULL); - if (mluOpGetTensorElementNum(voxel_points_count_desc) != 0) { - PARAM_CHECK(interface_name, voxel_points_count != NULL); - } - PARAM_CHECK(interface_name, voxel_num != NULL); - PARAM_CHECK(interface_name, grad_feats != NULL); - if (workspace_size != 0) { - PARAM_CHECK(interface_name, workspace != NULL); - } - return MLUOP_STATUS_SUCCESS; -} - -static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, int N) { - int max_core_num = mluop::runtime::getCoreNumOfJobLimitCapability(handle); - size_t core_num = handle->core_num_per_cluster; - if (N > max_core_num) { - k_dim->x = max_core_num; - *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle); - } else { - if (N <= 4) { - k_dim->x = core_num * 1; - *k_type = CNRT_FUNC_TYPE_UNION1; - } else if (N <= 8) { - k_dim->x = core_num * 2; - *k_type = CNRT_FUNC_TYPE_UNION2; - } else if (N <= 16) { - k_dim->x = core_num * 4; - *k_type = CNRT_FUNC_TYPE_UNION4; - } else if (N <= 32) { - k_dim->x = core_num * 8; - *k_type = CNRT_FUNC_TYPE_UNION8; - } else if (N <= 64) { - k_dim->x = core_num * 16; - *k_type = CNRT_FUNC_TYPE_UNION16; - } else { - LOG(ERROR) - << "[mluOpDynamicPointToVoxelBackward]: failed to choose kernel " - "to launch"; - return; - } - } - k_dim->y = 1; - k_dim->z = 1; - 
VLOG(5) << "Launch Kernel MLUKernelDynamicPointToVoxelBackward in UNION" - << *k_type / 4 << " type"; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDynamicPointToVoxelBackward( - const mluOpHandle_t handle, const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t grad_voxel_feats_desc, - const void *grad_voxel_feats, const mluOpTensorDescriptor_t feats_desc, - const void *feats, const mluOpTensorDescriptor_t voxel_feats_desc, - const void *voxel_feats, const mluOpTensorDescriptor_t point2voxel_map_desc, - const void *point2voxel_map, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const void *voxel_points_count, - const mluOpTensorDescriptor_t voxel_num_desc, const void *voxel_num, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_feats_desc, void *grad_feats) { - const char *interface_name = "[mluOpDynamicPointToVoxelBackward]"; - bool zero_element = false; - mluOpStatus_t param_check = DynamicPointToVoxelBackwardParamCheck( - interface_name, handle, reduce_type, grad_voxel_feats_desc, - grad_voxel_feats, feats_desc, feats, voxel_feats_desc, voxel_feats, - point2voxel_map_desc, point2voxel_map, voxel_points_count_desc, - voxel_points_count, voxel_num_desc, voxel_num, workspace, workspace_size, - grad_feats_desc, grad_feats, zero_element); - if (param_check != MLUOP_STATUS_SUCCESS) { - return param_check; - } - if (zero_element) { - VLOG(5) << interface_name << " Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // generator - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("dynamic_point_to_voxel_backward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "grad_voxel_feats", grad_voxel_feats, - grad_voxel_feats_desc); - GEN_CASE_DATA_REAL(true, "feats", feats, feats_desc); - GEN_CASE_DATA_REAL(true, "voxel_feats", voxel_feats, voxel_feats_desc); - GEN_CASE_DATA_REAL(true, "point2voxel_map", point2voxel_map, - point2voxel_map_desc); - GEN_CASE_DATA_REAL(true, "voxel_points_count", 
voxel_points_count, - voxel_points_count_desc); - GEN_CASE_DATA_REAL(true, "voxel_num", voxel_num, voxel_num_desc); - GEN_CASE_DATA(false, "grad_feats", grad_feats, grad_feats_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "dynamic_point_to_voxel_backward", - "reduce_type", reduce_type); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0.003, 0.003, 0); - } - - const int N = feats_desc->dims[0]; - const int C = feats_desc->dims[1]; - const auto grad_voxel_feats_element_num = - mluOpGetTensorElementNum(grad_voxel_feats_desc); - const auto grad_feats_element_num = mluOpGetTensorElementNum(grad_feats_desc); - VLOG(5) << interface_name << " N = " << N << ", C = " << C - << ", grad_voxel_feats_element_num=" << grad_voxel_feats_element_num - << ", grad_feats_element_num=" << grad_feats_element_num; - // 1. init output - uint64_t fill_0 = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_feats_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_0, - cnnl_output_desc, grad_feats)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, &k_dim, &k_type, N); - if (grad_voxel_feats_element_num != 0) { - // 2. 
init workspace - mluOpTensorDescriptor_t indices_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&indices_desc)); - int indices_dims[2] = {(int)grad_voxel_feats_element_num, 1}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - indices_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, 2, indices_dims)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(indices_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, - &grad_feats_element_num, cnnl_output_desc, - workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // 3. get scatter indices - CHECK_RETURN("[mluOpDynamicPointToVoxelBackward]", - KernelDynamicPointToVoxelBackward( - k_dim, k_type, handle->queue, feats, voxel_feats, - grad_feats, workspace, point2voxel_map, voxel_num, N, C)); - // 4. scatter - cnnlScatterNdMode_t scatter_mode = CNNL_SCATTERND_ADD; - mluOpTensorDescriptor_t updates_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&updates_desc)); - int updates_dims[1] = {(int)grad_voxel_feats_element_num}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - updates_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 1, updates_dims)); - mluOpTensorDescriptor_t output_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&output_desc)); - int output_dims[1] = {(int)grad_feats_element_num}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 1, output_dims)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(indices_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(updates_desc, - 
cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2(cnnl_handle, scatter_mode, cnnl_indices_desc, - workspace, cnnl_updates_desc, grad_voxel_feats, - NULL, NULL, cnnl_output_desc, grad_feats)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(updates_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(output_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(indices_desc)); - } - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelBackwardWorkspaceSize( - const mluOpHandle_t handle, const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t grad_voxel_feats_desc, - const mluOpTensorDescriptor_t feats_desc, - const mluOpTensorDescriptor_t voxel_feats_desc, - const mluOpTensorDescriptor_t point2voxel_map_desc, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const mluOpTensorDescriptor_t voxel_num_desc, size_t *workspace_size) { - const char *interface_name = - "[mluOpGetDynamicPointToVoxelBackwardWorkspaceSize]"; - PARAM_CHECK(interface_name, handle != NULL); - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << interface_name - << "Only mlu300 and above devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK(interface_name, grad_voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, feats_desc != NULL); - PARAM_CHECK(interface_name, voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, point2voxel_map_desc != NULL); - PARAM_CHECK(interface_name, voxel_points_count_desc != NULL); - PARAM_CHECK(interface_name, voxel_num_desc != NULL); - PARAM_CHECK(interface_name, workspace_size != NULL); - const int N = feats_desc->dims[0]; - const int C = feats_desc->dims[1]; - *workspace_size = N * C * sizeof(int); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h b/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h deleted file mode 100644 index 2a4df91b4..000000000 --- a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h +++ /dev/null @@ -1,37 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_\ -DYNAMIC_POINT_TO_VOXEL_BACKWARD_H -#define KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_\ -DYNAMIC_POINT_TO_VOXEL_BACKWARD_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *feats, const void *voxel_feats, void *grad_feats, - void *voxel_from, const void *point2voxel_map, const void *voxel_num, - const int N, const int C); - -#endif // KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_ - // DYNAMIC_POINT_TO_VOXEL_FORWARD_H diff --git a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu b/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu deleted file mode 100644 index 9e7d137b9..000000000 --- a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu +++ /dev/null @@ -1,201 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "dynamic_point_to_voxel_backward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void loadAsync(T *feats_nram, T *voxel_feats_nram, - int *index_mask_nram, int *voxel_from_nram, - int *point2voxel_map_real_nram, - const int *point2voxel_map_nram, - const int *index_col_nram, const T *feats, - const T *voxel_feats, const int *voxel_from, int &x, - int &n_real, const int n_limit, const int N, - const int C) { - int invalid_index = -1; - int size_feats = C * sizeof(T); - int size_feats_idx = C * sizeof(int); - n_real = 0; - for (; x < N && n_real < n_limit; x++) { - int point_to = point2voxel_map_nram[x]; - int input_offset = x * C; - int input_real_offset = n_real * C; - if (taskId == point_to % taskDim) { - if (point_to == invalid_index) { - continue; - } - int reduced_offset = point_to * C; - // load valid data to feats_nram - __memcpy_async(feats_nram + input_real_offset, feats + input_offset, - size_feats, GDRAM2NRAM); - // boradcast voxel_feats data to voxel_feats_nram via the same "point_to" - __memcpy_async(voxel_feats_nram + input_real_offset, - voxel_feats + reduced_offset, size_feats, GDRAM2NRAM); - // boradcast voxel_from data to voxel_from_nram via the same "point_to" - __memcpy_async(voxel_from_nram + input_real_offset, - voxel_from + reduced_offset, size_feats_idx, GDRAM2NRAM); - // record valid index of x in index_mask_nram - __bang_write_value(index_mask_nram + input_real_offset, C, x * C); - // point2voxel_map removed invalid data - point2voxel_map_real_nram[n_real] = point_to; - ++n_real; - } - } - if (n_real > 0) { - __bang_cycle_add(index_mask_nram, index_mask_nram, index_col_nram, - n_real * C, C); - } -} - -template -__mlu_func__ void compute(T *feats_nram, T *voxel_feats_nram, - int *index_mask_nram, int 
*voxel_from_nram, - const int n_real, const int N, const int C) { - if (n_real > 0) { - // view [n_real, C] as [n_real * C] - int deal_num = n_real * C; - // if (feats[i] == voxel_feats[i]) {mask[i] = 1} else {mask[i] = 0} - __bang_eq(feats_nram, voxel_feats_nram, feats_nram, deal_num); - // change mask1's dtype to int32 - __bang_float2int32_tz((int *)feats_nram, feats_nram, deal_num, 0); - // mask2 = NOT mask1 - __bang_not((int *)voxel_feats_nram, (int *)feats_nram, deal_num); - // choose index of "feats[i] == voxel_feats[i]" - __bang_mul((int *)feats_nram, (int *)feats_nram, index_mask_nram, deal_num); - // mask2 *= N * C - __bang_mul_scalar((int *)voxel_feats_nram, (int *)voxel_feats_nram, N * C, - deal_num); - // mix choosed index and 'N * C' - __bang_add(index_mask_nram, (int *)voxel_feats_nram, (int *)feats_nram, - deal_num); - // choose the min index - __bang_minequal(voxel_from_nram, voxel_from_nram, index_mask_nram, - deal_num); - } -} - -__mlu_func__ void storeAsync(int *voxel_from, const int *voxel_from_nram, - const int *point2voxel_map_real_nram, - bool *voxel_from_flag_nram, int *index_mask_nram, - const int n_real, const int N, const int C) { - int size_feats_idx = C * sizeof(int); - for (int i = 0; i < n_real; i++) { - int offset_real = point2voxel_map_real_nram[i]; - // 1) use atomicmin, too slow - // __bang_atomic_reduce_min(voxel_from + offset_real * C, - // voxel_from_nram + i * C, C); - // 2) compare one by one, use voxel_from_flag_nram as flags to record - // whether dst idx has appeard - if (voxel_from_flag_nram[offset_real] == false) { - // if number of grad idx on offset_real == 1, use the idx value directly - __memcpy_async(voxel_from + offset_real * C, voxel_from_nram + i * C, - size_feats_idx, NRAM2GDRAM); - // set voxel_from_flag to true - voxel_from_flag_nram[offset_real] = true; - } else { - __sync_io(); - // load the idx appeard - __memcpy(index_mask_nram, voxel_from + offset_real * C, size_feats_idx, - GDRAM2NRAM); - // if number of 
grad idx on offset_real > 1, pick the min idx value - __bang_minequal(index_mask_nram, index_mask_nram, voxel_from_nram + i * C, - C); - // store the new idx - __memcpy(voxel_from + offset_real * C, index_mask_nram, size_feats_idx, - NRAM2GDRAM); - } - } -} - -template -__mlu_global__ void MLUKernelMaxReduceTracebackScatterIdx( - const T *feats, const T *voxel_feats, T *grad_feats, int *voxel_from, - const int *point2voxel_map, const int *voxel_num, const int N, - const int C) { - const int M = *voxel_num; - if (M == 0) { - return; - } - int size_input = N * sizeof(int); - int size_reduced_flag = M * sizeof(bool); - int size_feats = C * sizeof(T); - int size_feats_idx = C * sizeof(int); - - int nram_size = MAX_NRAM_SIZE; - int n_limit = (nram_size - size_input - size_reduced_flag - size_feats_idx) / - (2 * size_feats + 2 * size_feats_idx + sizeof(int)); - int feats_limit = n_limit * C; - - T *feats_nram = (T *)nram_buffer; // [n_limit, C] - T *voxel_feats_nram = feats_nram + feats_limit; // [n_limit, C] - int *index_mask_nram = - (int *)(voxel_feats_nram + feats_limit); // [n_limit, C] - int *voxel_from_nram = index_mask_nram + feats_limit; // [n_limit, C] - int *point2voxel_map_nram = voxel_from_nram + feats_limit; // [N] - int *point2voxel_map_real_nram = point2voxel_map_nram + N; // [n_limit] - bool *voxel_from_flag_nram = - (bool *)(point2voxel_map_real_nram + n_limit); // [M] - int *index_col_nram = (int *)(voxel_from_flag_nram + M); // [C] - - __sync_all(); - - // broadcast point2voxel_map to nram - __memcpy(point2voxel_map_nram, point2voxel_map, size_input, GDRAM2NRAM); - // initialze voxel_from_flag to false - __memset_nram(voxel_from_flag_nram, M, (char)false); - for (int i = 0; i < C; i++) { - index_col_nram[i] = i; - } - for (int x = 0, n_real = 0; x < N;) { - // load data, get x and n_real - loadAsync(feats_nram, voxel_feats_nram, index_mask_nram, voxel_from_nram, - point2voxel_map_real_nram, point2voxel_map_nram, index_col_nram, - feats, voxel_feats, 
voxel_from, x, n_real, n_limit, N, C); - __sync(); - // compute - compute(feats_nram, voxel_feats_nram, index_mask_nram, voxel_from_nram, - n_real, N, C); - // store - storeAsync(voxel_from, voxel_from_nram, point2voxel_map_real_nram, - voxel_from_flag_nram, index_mask_nram, n_real, N, C); - __sync(); - } -} - -mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *feats, const void *voxel_feats, void *grad_feats, - void *voxel_from, const void *point2voxel_map, const void *voxel_num, - const int N, const int C) { - KERNEL_CHECK(MLUKernelMaxReduceTracebackScatterIdx<<>>( - (const float *)feats, (const float *)voxel_feats, (float *)grad_feats, - (int *)voxel_from, (const int *)point2voxel_map, (const int *)voxel_num, - N, C)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp deleted file mode 100644 index 1a42bcc1d..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp +++ /dev/null @@ -1,337 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "dynamic_point_to_voxel_forward.h" - -#include - -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -// policy function -static void policyFuncDynamicPointToVoxelForward(const mluOpHandle_t handle, - cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, - const int nums) { - int max_core_num = mluop::runtime::getCoreNumOfJobLimitCapability(handle); - size_t core_num = handle->core_num_per_cluster; - if (nums > max_core_num) { - k_dim->x = max_core_num; - *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle); - } else { - if (nums == 1) { - k_dim->x = 1; - *k_type = CNRT_FUNC_TYPE_BLOCK; - } else if (nums <= 4) { - k_dim->x = core_num * 1; - *k_type = CNRT_FUNC_TYPE_UNION1; - } else if (nums <= 8) { - k_dim->x = core_num * 2; - *k_type = CNRT_FUNC_TYPE_UNION2; - } else if (nums <= 16) { - k_dim->x = core_num * 4; - *k_type = CNRT_FUNC_TYPE_UNION4; - } else if (nums <= 32) { - k_dim->x = core_num * 8; - *k_type = CNRT_FUNC_TYPE_UNION8; - } else if (nums <= 64) { - k_dim->x = core_num * 16; - *k_type = CNRT_FUNC_TYPE_UNION16; - } - } - k_dim->y = 1; - k_dim->z = 1; - return; -} - -static mluOpStatus_t DynamicPointToVoxelForwardParamCheck( - const std::string &api, const mluOpHandle_t handle, - const mluOpReduceMode_t reduce_type, 
const void *feats, const void *coors, - const void *voxel_feats, const void *voxel_coors, - const void *point2voxel_map, const void *voxel_points_count, - const void *voxel_num, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t feats_desc, - const mluOpTensorDescriptor_t coors_desc, - const mluOpTensorDescriptor_t voxel_feats_desc, - const mluOpTensorDescriptor_t voxel_coors_desc, - const mluOpTensorDescriptor_t point2voxel_map_desc, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const mluOpTensorDescriptor_t voxel_num_desc, bool *zero_element) { - // check descriptor - PARAM_CHECK(api, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << api << "Only mlu300 and above devices are supported. " - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK(api, feats_desc != NULL); - PARAM_CHECK(api, coors_desc != NULL); - PARAM_CHECK(api, voxel_feats_desc != NULL); - PARAM_CHECK(api, voxel_coors_desc != NULL); - PARAM_CHECK(api, point2voxel_map_desc != NULL); - PARAM_CHECK(api, voxel_points_count_desc != NULL); - PARAM_CHECK(api, voxel_num_desc != NULL); - // check shape - PARAM_CHECK(api, feats_desc->dim == 2); - PARAM_CHECK(api, coors_desc->dim == 2); - PARAM_CHECK(api, voxel_feats_desc->dim == 2); - PARAM_CHECK(api, voxel_coors_desc->dim == 2); - PARAM_CHECK(api, point2voxel_map_desc->dim == 1); - PARAM_CHECK(api, voxel_points_count_desc->dim == 1); - PARAM_CHECK(api, voxel_num_desc->dim == 1); - - // check data type - PARAM_CHECK_V2(api, (feats_desc->dtype == MLUOP_DTYPE_FLOAT), - "Only float are supported in feats tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(feats_desc->dtype) << "."); - PARAM_CHECK_V2(api, (coors_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in coors tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(coors_desc->dtype) << "."); - PARAM_CHECK_V2( - api, 
(point2voxel_map_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in point2voxel_map tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(point2voxel_map_desc->dtype) << "."); - - PARAM_CHECK(api, voxel_feats_desc->dtype == feats_desc->dtype); - PARAM_CHECK(api, voxel_coors_desc->dtype == coors_desc->dtype); - PARAM_CHECK(api, - voxel_points_count_desc->dtype == point2voxel_map_desc->dtype); - PARAM_CHECK(api, voxel_num_desc->dtype == point2voxel_map_desc->dtype); - - if (reduce_type != MLUOP_REDUCE_DMAX && reduce_type != MLUOP_REDUCE_DMEAN) { - LOG(ERROR) << api << "Only support max and mean. " - << "Please check reduce_type!"; - return MLUOP_STATUS_BAD_PARAM; - } - - // check dim - PARAM_CHECK(api, feats_desc->dims[0] == coors_desc->dims[0]); - PARAM_CHECK(api, feats_desc->dims[0] == point2voxel_map_desc->dims[0]); - PARAM_CHECK(api, voxel_feats_desc->dims[0] == voxel_coors_desc->dims[0]); - PARAM_CHECK(api, - voxel_feats_desc->dims[0] == voxel_points_count_desc->dims[0]); - PARAM_CHECK(api, voxel_num_desc->dims[0] == 1); - PARAM_CHECK(api, feats_desc->dims[1] == voxel_feats_desc->dims[1]); - PARAM_CHECK(api, coors_desc->dims[1] == voxel_coors_desc->dims[1]); - PARAM_CHECK(api, coors_desc->dims[1] == 3); - PARAM_CHECK(api, feats_desc->dims[0] >= voxel_feats_desc->dims[0]); - - // check large tensor - const size_t feats_element_num = mluOpGetTensorElementNum(feats_desc); - const size_t coors_element_num = mluOpGetTensorElementNum(coors_desc); - TENSOR_NUM_CHECK(api, feats_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(api, coors_element_num, LARGE_TENSOR_NUM, ""); - - // check element num zero - if (feats_element_num == 0 || coors_element_num == 0) { - *zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - - // check workspace ptr - if (workspace_size > 0) { - PARAM_CHECK(api, workspace != NULL); - } - // input and output ptr check null - PARAM_CHECK(api, feats != NULL); - PARAM_CHECK(api, coors != NULL); - 
PARAM_CHECK(api, voxel_feats != NULL); - PARAM_CHECK(api, voxel_coors != NULL); - PARAM_CHECK(api, point2voxel_map != NULL); - PARAM_CHECK(api, voxel_points_count != NULL); - PARAM_CHECK(api, voxel_num != NULL); - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t feats_desc, - const mluOpTensorDescriptor_t coors_desc, size_t *workspace_size) { - const std::string api = "[mluOpGetDynamicPointToVoxelForwardWorkspaceSize]"; - PARAM_CHECK(api, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << "[mluOpGetDynamicPointToVoxelForwardWorkspaceSize] Only " - "mlu300 and above " - "devices are supported. " - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - PARAM_CHECK(api, feats_desc != NULL); - PARAM_CHECK(api, coors_desc != NULL); - PARAM_CHECK(api, workspace_size != NULL); - - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc); - CALL_CNNL(cnnlGetUniqueWorkspaceSize(cnnl_handle, unique_desc, - cnnl_input_desc, workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDynamicPointToVoxelForward( - const mluOpHandle_t handle, const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t feats_desc, const void *feats, - const mluOpTensorDescriptor_t coors_desc, void *coors, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t voxel_feats_desc, - void *voxel_feats, const 
mluOpTensorDescriptor_t voxel_coors_desc, - void *voxel_coors, const mluOpTensorDescriptor_t point2voxel_map_desc, - void *point2voxel_map, - const mluOpTensorDescriptor_t voxel_points_count_desc, - void *voxel_points_count, const mluOpTensorDescriptor_t voxel_num_desc, - void *voxel_num) { - const std::string api = "[mluOpDynamicPointToVoxelForward]"; - // check params - bool zero_element = false; - - mluOpStatus_t ret = DynamicPointToVoxelForwardParamCheck( - api, handle, reduce_type, feats, coors, voxel_feats, voxel_coors, - point2voxel_map, voxel_points_count, voxel_num, workspace, workspace_size, - feats_desc, coors_desc, voxel_feats_desc, voxel_coors_desc, - point2voxel_map_desc, voxel_points_count_desc, voxel_num_desc, - &zero_element); - - if (ret != MLUOP_STATUS_SUCCESS) { - LOG(ERROR) << api - << " Error found during element verification, please check."; - return ret; - } - // check zero element - if (zero_element) { - VLOG(5) << "[mluOpDynamicPointToVoxelForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - // generator - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("dynamic_point_to_voxel_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "feats", feats, feats_desc, -100, 100); - GEN_CASE_DATA_REAL(true, "coors", coors, coors_desc); - GEN_CASE_DATA(false, "voxel_feats", voxel_feats, voxel_feats_desc, 0, 0); - GEN_CASE_DATA(false, "voxel_coors", voxel_coors, voxel_coors_desc, 0, 0); - GEN_CASE_DATA(false, "point2voxel_map", point2voxel_map, - point2voxel_map_desc, 0, 0); - GEN_CASE_DATA(false, "voxel_points_count", voxel_points_count, - voxel_points_count_desc, 0, 0); - GEN_CASE_DATA(false, "voxel_num", voxel_num, voxel_num_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "dynamic_point_to_voxel_forward", "reduce_type", - reduce_type); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - const int num_points = feats_desc->dims[0]; - const int num_feats = feats_desc->dims[1]; - cnrtDim3_t k_dim; - 
cnrtFunctionType_t k_type; - policyFuncDynamicPointToVoxelForward(handle, &k_dim, &k_type, num_points); - VLOG(5) << api << " Launch [" << k_type << ", " << k_dim.x << ", " << k_dim.y - << ", " << k_dim.z << "]."; - // 1. mask_fill coors - VLOG(5) << api << " launch KernelMaskFillCoorsForward start."; - CHECK_RETURN("[MaskFillCoorsForward]", - KernelMaskFillCoorsForward(k_dim, k_type, handle->queue, - num_points, coors)); - VLOG(5) << api << " launch KernelMaskFillCoorsForward end."; - - // 2. unique op - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_coors_desc, - cnnl_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(point2voxel_map_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_points_count_desc, - cnnl_counts_desc); - - CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc, coors, - workspace, workspace_size, (int *)voxel_num, - cnnl_output_desc, voxel_coors, cnnl_indices_desc, - point2voxel_map, cnnl_counts_desc, - voxel_points_count)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_counts_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - - // 3. 
reduce - // fill -inf or zero - VLOG(5) << "cnnlFill_v3 min value start."; - float inf_value = 0x0; - if (reduce_type == MLUOP_REDUCE_DMAX) { - inf_value = -INFINITY; - } - const float fill_value = inf_value; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_feats_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, voxel_feats)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << "cnnlFill_v3 min value end."; - - VLOG(5) << api << " launch KernelDynamicPointToVoxelForward start."; - CHECK_RETURN("[mluOpDynamicPointToVoxelForward]", - KernelDynamicPointToVoxelForward( - k_dim, k_type, handle->queue, reduce_type, feats, num_points, - num_feats, voxel_coors, voxel_num, point2voxel_map, - voxel_points_count, voxel_feats)); - VLOG(5) << api << " launch KernelDynamicPointToVoxelForward end."; - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h deleted file mode 100644 index a2a64c866..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h +++ /dev/null @@ -1,39 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_DYNAMIC_POINT_TO_VOXEL_FORWARD_DYNAMIC_POINT_TO_VOXEL_FORWARD_H -#define KERNELS_DYNAMIC_POINT_TO_VOXEL_FORWARD_DYNAMIC_POINT_TO_VOXEL_FORWARD_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API -KernelMaskFillCoorsForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, int32_t num_points, void *coors); - -mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpReduceMode_t reduce_mode, const void *feats, int32_t num_points, - int32_t num_voxel, void *voxel_coors, void *voxel_num, - void *point2voxel_map, void *voxel_points_count, void *voxel_feats); - -#endif // KERNELS_DYNAMIC_POINT_TO_VOXEL_FORWARD_DYNAMIC_ - // POINT_TO_VOXEL_FORWARD_H diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu deleted file mode 100644 index b0c7d8711..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu +++ /dev/null @@ -1,338 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "dynamic_point_to_voxel_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define COORS_IDX 1 -#define COORS_XYZ 3 - -__mlu_func__ void load(const float *input_addr, float *nram_input, - const int deal_num, const int pi) { - int offset = (pi % 2) * 2 * deal_num; - float *nram_input_p = nram_input + offset; - __memcpy_async(nram_input_p, input_addr, deal_num * sizeof(float), - GDRAM2NRAM); -} - -__mlu_func__ void compute(float *nram_input, int *nram_points_count, - const int deal_num, const int pi) { - int offset = (pi % 2) * 2 * deal_num; - float *nram_input_p = nram_input + offset; - float *nram_output_p = nram_input + offset + deal_num; -#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372) - __bang_div(nram_output_p, nram_input_p, (float)(nram_points_count[pi]), - deal_num); -#else - __bang_mul_scalar(nram_output_p, nram_input_p, - 1.0 / (float)nram_points_count[pi], deal_num); -#endif -} - -__mlu_func__ void store(float *output_addr, float *nram_output, - const int deal_num, const int pi) { - int offset = (pi % 2) * 2 * deal_num; - float *nram_output_p = nram_output + offset + deal_num; - __memcpy_async(output_addr, nram_output_p, deal_num * sizeof(float), - NRAM2GDRAM); -} - -__mlu_func__ void lcsFunc(float *base_input_addr, int *base_points_count, - float 
*nram_input, const int repeat_num, - const int rem_num, const int deal_h) { - float *input_addr = NULL; - float *output_addr = NULL; - if (repeat_num > 0) { - input_addr = base_input_addr; - load(input_addr, nram_input, deal_h, 0); - __sync(); - } - - if (repeat_num > 1) { - // L(vi=1) - input_addr = base_input_addr + deal_h; - load(input_addr, nram_input, deal_h, 1); - // C(vi=0) - compute(nram_input, base_points_count, deal_h, 0); - __sync(); - } - - for (int v_iter = 0; v_iter < repeat_num - 2; v_iter++) { - // S(vi) - output_addr = base_input_addr + v_iter * deal_h; - store(output_addr, nram_input, deal_h, v_iter); - // C(vi+1) - compute(nram_input, base_points_count, deal_h, v_iter + 1); - // L(vi+2) - input_addr = base_input_addr + (v_iter + 2) * deal_h; - load(input_addr, nram_input, deal_h, v_iter + 2); - __sync_io_move_compute(); - } - - if (repeat_num > 1) { - // S(vi = repeat_num - 2) - output_addr = base_input_addr + (repeat_num - 2) * deal_h; - store(output_addr, nram_input, deal_h, repeat_num - 2); - } - if (rem_num > 0) { - // L[repeat_num] - input_addr = base_input_addr + repeat_num * deal_h; - load(input_addr, nram_input, rem_num, repeat_num); - } - if (repeat_num > 0) { - // C[repeat_num - 1] - compute(nram_input, base_points_count, deal_h, repeat_num - 1); - } - __sync(); - if (repeat_num > 0) { - // S[repeat_num - 1] - output_addr = base_input_addr + (repeat_num - 1) * deal_h; - store(output_addr, nram_input, deal_h, repeat_num - 1); - } - if (rem_num > 0) { - // C[repeat_num] - compute(nram_input, base_points_count, rem_num, repeat_num); - __sync(); - // S[repeat_num] - output_addr = base_input_addr + repeat_num * deal_h; - store(output_addr, nram_input, deal_h, repeat_num); - } -} - -__mlu_global__ void MLUKernelDynamicPointToVoxelForward( - mluOpReduceMode_t reduce_mode, const float *feats, int32_t num_points, - int32_t num_feats, int32_t *voxel_coors, int32_t *voxel_num, - int *point2voxel_map, int32_t *voxel_points_count, float 
*voxel_feats) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - bool reduce_map = false; - if (voxel_coors[0] == -1) { - reduce_map = true; - } - __sync_all_ipu(); - if (voxel_coors[0] == -1) { - if (taskId == 0) { - int32_t num_voxel = voxel_num[0] - 1; - __gdramset(voxel_num, 1, num_voxel); - __memcpy_async(voxel_coors, voxel_coors + COORS_XYZ, - (num_voxel + 1) * COORS_XYZ * sizeof(int32_t), - GDRAM2GDRAM); - __memcpy_async(voxel_points_count, voxel_points_count + COORS_IDX, - (num_voxel + 1) * COORS_IDX * sizeof(int32_t), - GDRAM2GDRAM); - __sync(); - } - } - __sync_all_ipu(); - - const int remainder = num_points % taskDim; - const int points_per_core = num_points / taskDim + (int)(taskId < remainder); - // offset of the point that core processes - const int points_offset = taskId * (num_points / taskDim) + - (taskId < remainder ? taskId : remainder); - // nram space - // |feats| - const int max_deal_h = ((MAX_NRAM_SIZE - sizeof(int32_t)) / sizeof(float)); - int deal_h = 0; - int deal_p = 0; - if (num_feats > max_deal_h) { - deal_p = 1; - deal_h = max_deal_h; - } else { - deal_h = num_feats; - deal_p = (MAX_NRAM_SIZE / (deal_h * sizeof(float) + sizeof(int))); - } - - float *nram_feats = (float *)nram_buffer; - int32_t *nram_map = (int32_t *)nram_feats + deal_p * deal_h; - const float *base_feats = feats + points_offset * num_feats; - int32_t *base_map = point2voxel_map + points_offset; - const int repeat_p = points_per_core / deal_p; - const int rem_p = points_per_core % deal_p; - const int repeat_h = num_feats / deal_h; - const int rem_h = num_feats % deal_h; - - for (int32_t p_iter = 0; p_iter <= repeat_p; p_iter++) { - int32_t deal_p_num = (p_iter < repeat_p) ? 
deal_p : rem_p; - if (deal_p_num == 0) { - break; - } - int32_t deal_p_num_offset = p_iter * deal_p * num_feats; - int32_t deal_map_offset = p_iter * deal_p * 1; - int32_t *base_map_addr = base_map + deal_map_offset; - // load map - __memcpy(nram_map, base_map_addr, deal_p_num * sizeof(int32_t), GDRAM2NRAM); - for (int32_t h_iter = 0; h_iter <= repeat_h; h_iter++) { - int32_t deal_h_num = (h_iter < repeat_h) ? deal_h : rem_h; - if (deal_h_num == 0) { - break; - } - int32_t deal_h_num_offset = deal_p_num_offset + h_iter * deal_h; - const float *base_feats_addr = base_feats + deal_h_num_offset; - // load - __memcpy_async(nram_feats, base_feats_addr, - deal_p_num * deal_h_num * sizeof(float), GDRAM2NRAM); - if (reduce_map) { - __bang_add_scalar(nram_map, nram_map, -1, deal_p_num); - } - __sync(); - // index and atomic - for (int32_t i = 0; i < deal_p_num; i++) { - int reduce_to = nram_map[i]; - if (reduce_to == -1) continue; - float *voxel_feats_offset = - voxel_feats + reduce_to * num_feats + h_iter * deal_h; - if (reduce_mode == MLUOP_REDUCE_DMAX) { - __bang_atomic_reduce_max(voxel_feats_offset, nram_feats + i * deal_h, - deal_h_num); - } else { - __bang_atomic_reduce_add(voxel_feats_offset, nram_feats + i * deal_h, - deal_h_num); - } - } - } - // store map - if (reduce_map) { - __memcpy(base_map_addr, nram_map, deal_p_num * sizeof(int32_t), - NRAM2GDRAM); - } - } - __sync_all_ipu(); - - int32_t num_voxel = voxel_num[0]; - if (reduce_mode == MLUOP_REDUCE_DMEAN) { - const int rem_voxel = num_voxel % taskDim; - const int voxel_per_core = num_voxel / taskDim + (int)(taskId < rem_voxel); - // offset of the point that core processes - const int voxel_offset = taskId * (num_voxel / taskDim) + - (taskId < rem_voxel ? 
taskId : rem_voxel); - // nram space - // |voxel_points_count| - // |voxel_feats_ping|voxel_feats_pong| - const int max_deal_h = - (MAX_NRAM_SIZE - sizeof(int32_t)) / (4 * sizeof(float)); - int deal_h = 0; - int deal_v = 0; - if (num_feats > max_deal_h) { - deal_v = 1; - deal_h = max_deal_h; - } else { - deal_h = num_feats; - deal_v = (MAX_NRAM_SIZE - 4 * deal_h * sizeof(float)) / (sizeof(int32_t)); - } - - int real_deal_v = deal_v > voxel_per_core ? voxel_per_core : deal_v; - - int *nram_points_count = (int *)nram_buffer; - float *voxel_feats_ping = (float *)(nram_points_count + real_deal_v); - int *base_points_count = (int *)voxel_points_count + voxel_offset; - float *base_voxel_feats = (float *)voxel_feats + voxel_offset * num_feats; - const int repeat_v = voxel_per_core / deal_v; - const int rem_v = voxel_per_core % deal_v; - const int repeat_h = num_feats / deal_h; - const int rem_h = num_feats % deal_h; - for (int v_iter = 0; v_iter <= repeat_v; v_iter++) { - int deal_v_num = (v_iter < repeat_v) ? 
deal_v : rem_v; - if (deal_v_num == 0) { - break; - } - float *base_voxel_feats_addr = - base_voxel_feats + v_iter * deal_v * num_feats; - int *base_points_count_addr = base_points_count + v_iter * deal_v; - __memcpy(nram_points_count, base_points_count_addr, - deal_v_num * sizeof(int), GDRAM2NRAM); - if (num_feats <= max_deal_h) { - // L(vi=0) - if (deal_v_num > 0) { - float *input_addr = base_voxel_feats_addr; - load(input_addr, voxel_feats_ping, deal_h, 0); - __sync(); - } - - if (deal_v_num > 1) { - // L(vi=1) - float *input_addr = base_voxel_feats_addr + deal_h; - load(input_addr, voxel_feats_ping, deal_h, 1); - // C(vi=0) - compute(voxel_feats_ping, nram_points_count, deal_h, 0); - __sync(); - } - - for (int vi = 0; vi < deal_v_num - 2; vi++) { - // S(vi) - float *output_addr = base_voxel_feats_addr + vi * deal_h; - store(output_addr, voxel_feats_ping, deal_h, vi); - // C(vi+1) - compute(voxel_feats_ping, nram_points_count, deal_h, vi + 1); - // L(vi+2) - float *input_addr = base_voxel_feats_addr + (vi + 2) * deal_h; - load(input_addr, voxel_feats_ping, deal_h, vi + 2); - __sync(); - } - - if (deal_v_num > 1) { - // S(vi = deal_v_num - 2) - float *output_addr = - base_voxel_feats_addr + (deal_v_num - 2) * deal_h; - store(output_addr, voxel_feats_ping, deal_h, deal_v_num - 2); - __sync(); - } - if (deal_v_num > 0) { - // C[deal_v_num - 1] - compute(voxel_feats_ping, nram_points_count, deal_h, deal_v_num - 1); - } - __sync(); - if (deal_v_num > 0) { - // S[deal_v_num - 1] - float *output_addr = - base_voxel_feats_addr + (deal_v_num - 1) * deal_h; - store(output_addr, voxel_feats_ping, deal_h, deal_v_num - 1); - } - } else { - // vi = points_offset + v_iter - lcsFunc(base_voxel_feats_addr, nram_points_count, voxel_feats_ping, - repeat_h, rem_h, deal_h); - } - } - } -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpReduceMode_t reduce_mode, const void *feats, 
int32_t num_points, - int32_t num_feats, void *voxel_coors, void *voxel_num, - void *point2voxel_map, void *voxel_points_count, void *voxel_feats) { - KERNEL_CHECK(MLUKernelDynamicPointToVoxelForward<<>>( - reduce_mode, (float *)feats, num_points, num_feats, - (int32_t *)voxel_coors, (int32_t *)voxel_num, (int *)point2voxel_map, - (int32_t *)voxel_points_count, (float *)voxel_feats)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu deleted file mode 100644 index 63f350b47..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu +++ /dev/null @@ -1,175 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "dynamic_point_to_voxel_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -#define COORS_XYZ 3 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void load(const int32_t *input_addr, int32_t *nram_input, - const int32_t pingpong, const int32_t deal_num, - const int32_t pi) { - int32_t offset = (pi % 2) * pingpong; - int32_t *nram_input_ptr = nram_input + offset; - __memcpy_async(nram_input_ptr, input_addr, deal_num * sizeof(int), - GDRAM2NRAM); -} - -__mlu_func__ void compute(int32_t *coors_ping_in, int32_t *mask_x, - const int32_t pingpong, int32_t deal_num, - const int32_t pi) { - int32_t offset = (pi % 2) * pingpong; - int32_t *coors_ping_in_p = coors_ping_in + offset; - int32_t *coors_ping_out_p = coors_ping_in_p + pingpong / 2; - int32_t N = deal_num / COORS_XYZ; - int32_t *mask_y = mask_x + N; - int32_t *mask_z = mask_x + 2 * N; - __bang_transpose(coors_ping_out_p, coors_ping_in_p, N, 3); - - __bang_int322float((float *)coors_ping_out_p, coors_ping_out_p, deal_num, 0); - __bang_int322float((float *)mask_x, mask_x, deal_num, 0); - - __bang_lt_scalar((float *)mask_x, (float *)coors_ping_out_p, 0, 3 * N); - __bang_float2int32((int32_t *)coors_ping_out_p, (float *)coors_ping_out_p, - deal_num, 0); - __bang_float2int32((int32_t *)mask_x, (float *)mask_x, deal_num, 0); - __bang_add((int *)mask_x, (int *)mask_x, (int *)mask_y, N); - __bang_add((int *)mask_x, (int *)mask_x, (int *)mask_z, N); - __bang_not((int *)mask_x, (int *)mask_x, N); - - __bang_cycle_mul((int *)coors_ping_out_p, (int *)coors_ping_out_p, - (int *)mask_x, deal_num, N); - __bang_add_scalar((int *)mask_x, (int *)mask_x, -1, N); - __bang_cycle_add((int *)coors_ping_out_p, (int *)coors_ping_out_p, - (int *)mask_x, deal_num, N); - __bang_transpose(coors_ping_in_p, coors_ping_out_p, 3, N); -} - -__mlu_func__ void 
store(int32_t *output_addr, int32_t *nram_output, - const int32_t pingpong, const int32_t deal_num, - const int32_t pi) { - int32_t offset = (pi % 2) * pingpong; - int32_t *nram_output_ptr = nram_output + offset; - __memcpy_async(output_addr, nram_output_ptr, deal_num * sizeof(int), - NRAM2GDRAM); -} - -__mlu_global__ void MLUKernelMaskFillCoorsForward(int32_t num_points, - int32_t *coors) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - int32_t remainder = num_points % taskDim; - int32_t points_per_core = - num_points / taskDim + (int32_t)(taskId < remainder); - // offset of the point32_t that core processes - int32_t points_offset = taskId * (num_points / taskDim) + - (taskId < remainder ? taskId : remainder); - // nram space - - // |coors_ping_in|coors_ping_out|coors_pong_in|coors_pong_out|mask_x|mask_y|mask_z| - int32_t max_deal_num = - FLOOR_ALIGN(MAX_NRAM_SIZE / 5, COORS_XYZ * sizeof(int32_t)) / - sizeof(int32_t); - int32_t coors_num = points_per_core * COORS_XYZ; - int32_t deal_num = max_deal_num > coors_num ? 
coors_num : max_deal_num; - int32_t repeat_n = coors_num / max_deal_num; - int32_t rem_num = coors_num % max_deal_num; - - int32_t *coors_ping_in = (int32_t *)nram_buffer; - int32_t *mask_x = (int32_t *)coors_ping_in + 4 * deal_num; - int32_t pingpong = 2 * deal_num; - int32_t *base_coors = (int32_t *)coors + points_offset * COORS_XYZ; - - if (repeat_n > 0) { - int32_t *input_addr = base_coors; - load(input_addr, coors_ping_in, pingpong, deal_num, 0); - __sync(); - } - - if (repeat_n > 1) { - // L(vi=1) - int32_t *input_addr = base_coors + deal_num; - load(input_addr, coors_ping_in, pingpong, deal_num, 1); - // C(vi=0) - compute(coors_ping_in, mask_x, pingpong, deal_num, 0); - __sync(); - } - - for (int32_t v_iter = 0; v_iter < repeat_n - 2; v_iter++) { - // S(vi) - int32_t *output_addr = base_coors + v_iter * deal_num; - store(output_addr, coors_ping_in, pingpong, deal_num, v_iter); - // C(vi+1) - compute(coors_ping_in, mask_x, pingpong, deal_num, v_iter + 1); - // L(vi+2) - int32_t *input_addr = base_coors + (v_iter + 2) * deal_num; - load(input_addr, coors_ping_in, pingpong, deal_num, v_iter + 2); - __sync(); - } - - if (repeat_n > 1) { - // S(vi = repeat_n - 2) - int32_t *output_addr = base_coors + (repeat_n - 2) * deal_num; - store(output_addr, coors_ping_in, pingpong, deal_num, repeat_n - 2); - } - if (rem_num > 0) { - // L[repeat_n] - int32_t *input_addr = base_coors + repeat_n * deal_num; - load(input_addr, coors_ping_in, pingpong, rem_num, repeat_n); - } - if (repeat_n > 0) { - // C[repeat_n - 1] - compute(coors_ping_in, mask_x, pingpong, deal_num, repeat_n - 1); - } - __sync(); - if (repeat_n > 0) { - // S[repeat_n - 1] - int32_t *output_addr = base_coors + (repeat_n - 1) * deal_num; - store(output_addr, coors_ping_in, pingpong, deal_num, repeat_n - 1); - } - if (rem_num > 0) { - // C[repeat_n] - compute(coors_ping_in, mask_x, pingpong, rem_num, repeat_n); - __sync(); - // S[repeat_n] - int32_t *output_addr = base_coors + repeat_n * deal_num; - 
store(output_addr, coors_ping_in, pingpong, rem_num, repeat_n); - } - -#endif -} - -mluOpStatus_t MLUOP_WIN_API -KernelMaskFillCoorsForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, int32_t num_points, void *coors) { - KERNEL_CHECK(MLUKernelMaskFillCoorsForward<<>>( - num_points, (int32_t *)coors)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/get_indice_pairs/get_indice_pairs.cpp b/kernels/get_indice_pairs/get_indice_pairs.cpp deleted file mode 100644 index f70489dd2..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/tensor.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "mlu_op.h" - -static void getIndicePairsGencase( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num) { - GEN_CASE_START("get_indice_pairs"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(false, "out_indices", out_indices, out_indices_desc); - GEN_CASE_DATA_REAL(false, "indice_pairs", indice_pairs, indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "indice_num", indice_num, indice_num_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "get_indice_pairs", "dimnb", - sparse_conv_desc->dimNb); - GEN_CASE_OP_PARAM_SINGLE(0, "get_indice_pairs", "batch", - sparse_conv_desc->batch); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "pad", sparse_conv_desc->pad, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "stride", - sparse_conv_desc->stride, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "dilation", - sparse_conv_desc->dilation, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "input_space", - sparse_conv_desc->input_space, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "filter_space", - sparse_conv_desc->filter_space, - sparse_conv_desc->dimNb == 4 ? 
2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "output_space", - sparse_conv_desc->output_space, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_SINGLE(2, "get_indice_pairs", "sub_m", - sparse_conv_desc->sub_m); - GEN_CASE_OP_PARAM_SINGLE(2, "get_indice_pairs", "transpose", - sparse_conv_desc->transpose); - GEN_CASE_OP_PARAM_SINGLE(2, "get_indice_pairs", "inverse", - sparse_conv_desc->inverse); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0.003, 0.003, 0); -} - -static mluOpStatus_t internalGetIndicePairs( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num, - const bool is_get_workspace, size_t *return_ws) { - PARAM_CHECK(interface_name, handle != NULL); - PARAM_CHECK(interface_name, sparse_conv_desc != NULL); - PARAM_CHECK(interface_name, indices_desc != NULL); - PARAM_CHECK(interface_name, indice_pairs_desc != NULL); - PARAM_CHECK(interface_name, out_indices_desc != NULL); - PARAM_CHECK(interface_name, indice_num_desc != NULL); - - // check platform - if (handle->arch < 372) { - LOG(ERROR) << interface_name - << " Only mlu300 and above devices are supported." 
- << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // sparse_conv_desc dimNb check - int sparse_conv_dimNb = sparse_conv_desc->dimNb; - - // indices indice_pairs out_indices indice_num - // tensor dim check - PARAM_CHECK(interface_name, indices_desc->dim == 2); - PARAM_CHECK(interface_name, indice_pairs_desc->dim == 3); - PARAM_CHECK(interface_name, out_indices_desc->dim == 2); - PARAM_CHECK(interface_name, indice_num_desc->dim == 1); - PARAM_CHECK(interface_name, indices_desc->dims[1] == 4); - PARAM_CHECK(interface_name, out_indices_desc->dims[1] == 4); - PARAM_CHECK(interface_name, indice_pairs_desc->dims[1] == 2); - - // check shape - PARAM_CHECK(interface_name, - indice_pairs_desc->dims[2] == indices_desc->dims[0]); - PARAM_CHECK(interface_name, - indice_pairs_desc->dims[0] == indice_num_desc->dims[0]); - int kernel_volume = 1; - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - kernel_volume *= sparse_conv_desc->filter_space[i]; - } - int output_spaces = sparse_conv_desc->batch; - int input_spaces = sparse_conv_desc->batch; - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - output_spaces *= sparse_conv_desc->output_space[i]; - input_spaces *= sparse_conv_desc->input_space[i]; - } - PARAM_CHECK_LE(interface_name, indices_desc->dims[0], input_spaces); - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - PARAM_CHECK_GE(interface_name, sparse_conv_desc->pad[i], 0); - PARAM_CHECK_GE(interface_name, sparse_conv_desc->dilation[i], 1); - PARAM_CHECK_GE(interface_name, sparse_conv_desc->stride[i], 1); - if (sparse_conv_desc->dilation[i] != 1 && - sparse_conv_desc->stride[i] != 1) { - return MLUOP_STATUS_BAD_PARAM; - } - } - PARAM_CHECK(interface_name, indice_pairs_desc->dims[0] == kernel_volume); - PARAM_CHECK_LE(interface_name, kernel_volume, 4096); - PARAM_CHECK_LE(interface_name, out_indices_desc->dims[0], output_spaces); - - // large tensor - PARAM_CHECK_LE(interface_name, indices_desc->dims[0], - 
INDICE_IN_LARGE_TENSOR_NUM); - if (mluOpGetTensorElementNum(indices_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(out_indices_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(indice_pairs_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(indice_num_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << interface_name << " Overflow max tensor num." - << " Currently, MLU-OPS supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // tensor datatype check - PARAM_CHECK_EQ(interface_name, indices_desc->dtype, MLUOP_DTYPE_INT32); - PARAM_CHECK_EQ(interface_name, indice_pairs_desc->dtype, MLUOP_DTYPE_INT32); - PARAM_CHECK_EQ(interface_name, out_indices_desc->dtype, MLUOP_DTYPE_INT32); - PARAM_CHECK_EQ(interface_name, indice_num_desc->dtype, MLUOP_DTYPE_INT32); - // special check - int sub_m = sparse_conv_desc->sub_m; - if (sub_m) { - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - PARAM_CHECK_EQ(interface_name, sparse_conv_desc->input_space[i], - sparse_conv_desc->output_space[i]); - PARAM_CHECK_EQ(interface_name, sparse_conv_desc->stride[i], 1); - PARAM_CHECK_EQ(interface_name, sparse_conv_desc->dilation[i], 1); - } - } - - // check zero elment - if (mluOpGetTensorElementNum(indices_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(out_indices_desc) == 0 || - mluOpGetTensorElementNum(indice_num_desc) == 0) { - sparse_conv_desc->num_act_out = 0; - return MLUOP_STATUS_SUCCESS; - } - - // check nullptr - if (!is_get_workspace) { - PARAM_CHECK(interface_name, indices != NULL); - PARAM_CHECK(interface_name, indice_pairs != NULL); - PARAM_CHECK(interface_name, out_indices != NULL); - PARAM_CHECK(interface_name, indice_num != NULL); - if (workspace_size != 0) { - PARAM_CHECK(interface_name, workspace != NULL); - } - } - // gencase - if (!is_get_workspace && MLUOP_GEN_CASE_ON_NEW) { - getIndicePairsGencase(handle, sparse_conv_desc, indices_desc, indices, - indice_pairs_desc, 
indice_pairs, out_indices_desc, - out_indices, indice_num_desc, indice_num); - } - - // call normal implementaion - mluOpStatus_t return_status; - return_status = normalGetIndicePairs( - handle, interface_name, sparse_conv_desc, indices_desc, indices, - workspace, workspace_size, indice_pairs_desc, indice_pairs, - out_indices_desc, out_indices, indice_num_desc, indice_num, - is_get_workspace, return_ws); - - if (!is_get_workspace) { - GEN_CASE_END(); - } - return return_status; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetIndicePairs( - mluOpHandle_t handle, mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num) { - std::string interface_name = "[mluOpGetIndicesPairs]"; - return internalGetIndicePairs( - handle, interface_name, sparse_conv_desc, indices_desc, indices, - workspace, workspace_size, indice_pairs_desc, indice_pairs, - out_indices_desc, out_indices, indice_num_desc, indice_num, false, NULL); -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetIndicePairsWorkspaceSize( - mluOpHandle_t handle, mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t out_indices_desc, - const mluOpTensorDescriptor_t indice_num_desc, size_t *workspace_size) { - std::string interface_name = "[mluOpGetIndicePairsWorkspaceSize]"; - PARAM_CHECK(interface_name, handle != NULL); - PARAM_CHECK(interface_name, sparse_conv_desc != NULL); - PARAM_CHECK(interface_name, indices_desc != NULL); - PARAM_CHECK(interface_name, indice_pairs_desc != NULL); - PARAM_CHECK(interface_name, out_indices_desc != NULL); - PARAM_CHECK(interface_name, indice_num_desc != NULL); - 
PARAM_CHECK(interface_name, workspace_size != NULL); - if (mluOpGetTensorElementNum(indices_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(out_indices_desc) == 0 || - mluOpGetTensorElementNum(indice_num_desc) == 0) { - workspace_size[0] = 0; - return MLUOP_STATUS_SUCCESS; - } - - return internalGetIndicePairs(handle, interface_name, sparse_conv_desc, - indices_desc, NULL, NULL, 0, indice_pairs_desc, - NULL, out_indices_desc, NULL, indice_num_desc, - NULL, true, workspace_size); -} diff --git a/kernels/get_indice_pairs/get_indice_pairs_block.mlu b/kernels/get_indice_pairs/get_indice_pairs_block.mlu deleted file mode 100644 index c249b6e7d..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_block.mlu +++ /dev/null @@ -1,558 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include - -#include "core/logging.h" -#include "kernels/get_indice_pairs/get_indice_pairs_utils.h" -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "kernels/kernel.h" - -#define KERNEL_V (4096) -#define NRAM_LIMIT \ - (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024 - 3 * KERNEL_V * sizeof(float)) - -#define Ndim (4) -__nram__ float filter_kd_index[KERNEL_V]; -__nram__ float filter_kh_index[KERNEL_V]; -__nram__ float filter_kw_index[KERNEL_V]; - -__nram__ char nbuf_total[NRAM_LIMIT]; - -__mlu_func__ void computeIndex(int32_t *nram_output, int32_t *nram_input, - int32_t *nram_aux_a, float *nram_aux_b, - OutputSpace output_space, Stride stride, - Dilation dilation, Padding padding, - int32_t deal_num, int32_t step_index_start, - int32_t k_dhw, int32_t batch) { -#if __BANG_ARCH__ >= 370 - int32_t len_l_dim = deal_num * (Ndim + 1); - int32_t deal_num_lk = deal_num * k_dhw; - int32_t output_size = - batch * output_space.o_d * output_space.o_h * output_space.o_w + 1; - __bang_transpose((int32_t *)nram_aux_a, (int32_t *)nram_input, deal_num, - Ndim); - stepIndex((int32_t *)nram_aux_a + deal_num * Ndim, step_index_start, - deal_num); - expandInput((int32_t *)nram_aux_a, len_l_dim, k_dhw); - __bang_transpose((int32_t *)nram_aux_b, (int32_t *)nram_aux_a, k_dhw, - len_l_dim); - __bang_transpose((int32_t *)nram_output + deal_num_lk, - (int32_t *)nram_aux_b + deal_num_lk * Ndim, deal_num, k_dhw); - __bang_int322float_rn((float *)nram_aux_b, (int32_t *)nram_aux_b, - k_dhw * len_l_dim, 0); - computeOutputIndex((float *)nram_aux_b + k_dhw * len_l_dim, - (float *)nram_aux_b, (float *)nram_aux_a, filter_kd_index, - 
filter_kh_index, filter_kw_index, deal_num_lk, k_dhw, - stride, dilation, padding); - computeMask((float *)nram_aux_a + deal_num_lk * Ndim, - (float *)nram_aux_b + k_dhw * len_l_dim, - (float *)nram_aux_a + deal_num_lk, deal_num_lk, output_space); - __bang_float2int32_tz((int32_t *)nram_aux_a, - (float *)nram_aux_a + deal_num_lk * Ndim, deal_num_lk, - 0); - __bang_transpose((int32_t *)nram_output, (int32_t *)nram_aux_a, deal_num, - k_dhw); - genIndiceOutput((int32_t *)nram_aux_a + deal_num_lk, (float *)nram_aux_b, - (float *)nram_aux_b + k_dhw * len_l_dim, - (int32_t *)nram_aux_a + 2 * deal_num_lk, deal_num_lk, - output_space); - genIndiceOutExpand((int32_t *)nram_aux_a + 2 * deal_num_lk, - (int32_t *)nram_aux_a, (int32_t *)nram_aux_a + deal_num_lk, - (int32_t *)nram_aux_a + 3 * deal_num_lk, deal_num_lk, - output_size); - __bang_transpose((int32_t *)nram_output + 2 * deal_num_lk, - (int32_t *)nram_aux_a + 2 * deal_num_lk, deal_num, k_dhw); -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel1( - void *mask_all_ws, void *indice_index_in_ws, void *indice_out_expand_ws, - void *indices_in, FilterSpace host_filter_space, - InputSpace host_input_space, OutputSpace host_output_space, - Stride host_stride, Dilation host_dilation, Padding host_padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch) { -#if __BANG_ARCH__ >= 370 - /* nram_space - |input| mask_all | indice_index_in | indice_out_expand | 4l + 3 k l - |input| mask_all | indice_index_in | indice_out_expand | 4l + 3 k l - | nram_aux_a 5 l k | nram_aux_b 8 l k - */ - FilterSpace filter_space = host_filter_space; - // InputSpace input_space = host_input_space; - OutputSpace output_space = host_output_space; - Stride stride = host_stride; - Dilation dilation = host_dilation; - Padding padding = host_padding; - int32_t k_d = filter_space.k_d, k_h = filter_space.k_h, - k_w = filter_space.k_w; - int32_t k_dhw = k_d * k_h * k_w; - genFilterIndex(filter_kd_index, filter_kh_index, 
filter_kw_index, k_d, k_h, - k_w); - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(input_active_site, taskIdY, taskDimY, offset_l_job, len_l_job); - int32_t repeat = (len_l_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_l_job % core_num_l == 0 ? core_num_l : len_l_job % core_num_l; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t load_num = core_num_l * Ndim; - float *nram_output = (float *)nbuf_total + load_num; - int32_t len_l_k = core_num_l * k_dhw; - int32_t ping_pong_num = load_num + len_l_k * 3; - float *nram_aux_a = (float *)nbuf_total + 2 * ping_pong_num; - float *nram_aux_b = (float *)nram_aux_a + len_l_k * (Ndim + 1); - int step_index_start = offset_l_job; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - int32_t *indices_in_addr = - (int32_t *)indices_in + (offset_l_job + i * core_num_l) * Ndim; - int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - __memcpy_async((char *)nram_input_t, (char *)indices_in_addr, - deal_num * Ndim * sizeof(int), GDRAM2NRAM); - } - if (1 <= i && i < (repeat + 1)) { - int32_t deal_num = (i - 1) == repeat - 1 ? rem_num_l : core_num_l; - int32_t *nram_input_t = - (int32_t *)nram_input + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; - computeIndex(nram_output_t, nram_input_t, (int32_t *)nram_aux_a, - nram_aux_b, output_space, stride, dilation, padding, - deal_num, step_index_start, k_dhw, batch); - step_index_start += deal_num; - } - if (i >= 2) { - int32_t deal_num = (i - 2) == repeat - 1 ? 
rem_num_l : core_num_l; - uint64_t gdram_offset = - (offset_l_job + (i - 2) * core_num_l) * sizeof(int); - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)mask_all_ws + gdram_offset, - (char *)(nram_output_t), deal_num * sizeof(int), - NRAM2GDRAM, input_active_site * sizeof(int), - deal_num * sizeof(int), k_dhw - 1); - __memcpy_async((char *)indice_index_in_ws + gdram_offset, - (char *)(nram_output_t + deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), deal_num * sizeof(int), - k_dhw - 1); - __memcpy_async((char *)indice_out_expand_ws + gdram_offset, - (char *)(nram_output_t + 2 * deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), deal_num * sizeof(int), - k_dhw - 1); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel2(void *index_output_ptr, - int32_t num_act_out, - int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 - int32_t len_job = 0, offset_job = 0; - assignTask(num_act_out, taskIdY, taskDimY, offset_job, len_job); - int32_t repeat = (len_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_job % core_num_l == 0 ? core_num_l : len_job % core_num_l; - int32_t *nram_input = (int32_t *)nbuf_total; - for (int i = 0; i < repeat; ++i) { - int32_t start_index = offset_job + i * core_num_l; - int32_t length = i == (repeat - 1) ? 
rem_num_l : core_num_l; - stepIndex((int32_t *)nram_input, start_index, length); // sync - int32_t *output_ptr = (int32_t *)index_output_ptr + start_index; - __memcpy((char *)output_ptr, nram_input, length * sizeof(int), NRAM2GDRAM); - } -#endif -} - -__mlu_global__ void MLUBlockBalanceGetIndicePairKernel( - void *balance_input, void *balance_mask, void *balance_output, - int32_t len_l, int32_t kernel_volume, int32_t core_num_l, - int32_t output_size) { -#if __BANG_ARCH__ >= 370 - int32_t len_job, offset_job = 0; - assignTask(len_l * kernel_volume, taskIdY, taskDimY, offset_job, len_job); - int32_t repeat = (len_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_job % core_num_l == 0 ? core_num_l : len_job % core_num_l; - int32_t *nram_random_num = (int32_t *)nbuf_total; - int32_t *nram_input = (int32_t *)nbuf_total + core_num_l; - int32_t *nram_mask = (int32_t *)nbuf_total + 2 * core_num_l; - int32_t *nram_output = (int32_t *)nbuf_total + 3 * core_num_l; - int32_t ping_pong_num = 3 * core_num_l; - int32_t *nram_aux = (int32_t *)nbuf_total + 7 * core_num_l; - int32_t multi_max = output_size / taskDimY; - stepIndex(nram_random_num, taskId * multi_max, core_num_l); - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - int32_t *balance_input_ptr = - (int32_t *)balance_input + offset_job + i * core_num_l; - int32_t *balance_mask_ptr = - (int32_t *)balance_mask + offset_job + i * core_num_l; - int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; - int32_t *nram_mask_t = (int32_t *)nram_mask + (i % 2) * ping_pong_num; - __memcpy_async((char *)nram_input_t, (char *)balance_input_ptr, - deal_num * sizeof(int), GDRAM2NRAM); - __memcpy_async((char *)nram_mask_t, (char *)balance_mask_ptr, - deal_num * sizeof(int), GDRAM2NRAM); - } - if (1 <= i && i <= repeat) { - int32_t deal_num = (i - 1) == repeat - 1 ? 
rem_num_l : core_num_l; - int32_t *nram_input_t = - (int32_t *)nram_input + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_mask_t = - (int32_t *)nram_mask + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; - __bang_mul_scalar((int32_t *)nram_aux, (int32_t *)nram_mask_t, int(-1), - deal_num); - __bang_band((char *)nram_output_t, (char *)nram_input_t, (char *)nram_aux, - deal_num * sizeof(int)); - __bang_sub_scalar((int32_t *)nram_aux, (int32_t *)nram_mask_t, int(1), - deal_num); - __bang_band((char *)nram_aux, (char *)nram_aux, (char *)nram_random_num, - deal_num * sizeof(int)); - __bang_add((int32_t *)nram_output_t, (int32_t *)nram_output_t, - (int32_t *)nram_aux, deal_num); - } - if (i >= 2) { - int32_t deal_num = (i - 2) == repeat - 1 ? rem_num_l : core_num_l; - uint64_t gdram_offset = (offset_job + (i - 2) * core_num_l) * sizeof(int); - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)balance_output + gdram_offset, - (char *)nram_output_t, deal_num * sizeof(int), NRAM2GDRAM); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel3( - void *indice_pair, void *indice_index_ptr, void *mask_all, int32_t len_l, - int32_t kernel_volume, int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(2 * kernel_volume, taskIdY, taskDimY, offset_l_job, len_l_job); - float *nram_input = (float *)nbuf_total; - float *nram_mask = (float *)nram_input + core_num_l; - float *nram_output = (float *)nram_input + core_num_l * 2; - float *nram_aux = (float *)nram_input + core_num_l * 3; - // | nram_input | nram_mask | nram_output | nram_aux | - for (int j = 0; j < len_l_job; ++j) { - int32_t mask_offset = (offset_l_job + j) % kernel_volume; - int32_t indice_store = ((offset_l_job + j) % kernel_volume) * 2; - int32_t store_offset = - (offset_l_job + j) < kernel_volume ? 
indice_store : indice_store + 1; - int32_t *index_job_start = - (int32_t *)indice_index_ptr + (offset_l_job + j) * len_l; - int32_t *mask_job_start = (int32_t *)mask_all + mask_offset * len_l; - int32_t core_offset_l_valid = 0; - int32_t valid_l_num_now = 0; - int32_t repeat = (len_l + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_l % core_num_l == 0 ? core_num_l : len_l % core_num_l; - for (int i = 0; i < repeat; ++i) { - int32_t load_l_num = i == (repeat - 1) ? rem_num_l : core_num_l; - int32_t *index_start = (int32_t *)index_job_start + i * core_num_l; - int32_t *mask_start = (int32_t *)mask_job_start + i * core_num_l; - __memcpy(nram_input, index_start, load_l_num * sizeof(int), GDRAM2NRAM); - __memcpy(nram_mask, mask_start, load_l_num * sizeof(int), GDRAM2NRAM); - __bang_int322float_rn((float *)nram_aux, (int32_t *)nram_mask, load_l_num, - 0); - valid_l_num_now = __bang_count((float *)nram_aux, load_l_num); - __bang_collect((float *)nram_output, (float *)nram_input, - (float *)nram_aux, load_l_num); - int32_t *store_valid_ptr = - (int32_t *)indice_pair + store_offset * len_l + core_offset_l_valid; - core_offset_l_valid += valid_l_num_now; - if (valid_l_num_now > 0) { - __memcpy((char *)store_valid_ptr, (char *)nram_output, - valid_l_num_now * sizeof(int32_t), NRAM2GDRAM); - } - } - } -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel4( - void *indice_out, void *input_ptr, OutputSpace host_output_space, - int32_t len_l, int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 - OutputSpace output_space = host_output_space; - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(len_l, taskIdY, taskDimY, offset_l_job, len_l_job); - int32_t ping_pong_num = core_num_l * 5; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t *nram_output = (int32_t *)nbuf_total + core_num_l; - int32_t *nram_aux = (int32_t *)nbuf_total + 2 * ping_pong_num; - int32_t *input_start_core = (int32_t *)input_ptr + offset_l_job; - // |nram_input | nram_output * 4 
| nram_input | nram_output * 4 | nram_aux| - int32_t rem_num_l = - len_l_job % core_num_l == 0 ? core_num_l : len_l_job % core_num_l; - int32_t repeat = (len_l_job + core_num_l - 1) / core_num_l; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - int32_t load_num_l = i == (repeat - 1) ? rem_num_l : core_num_l; - int32_t *input_start_ptr = input_start_core + i * core_num_l; - int32_t *nram_input_load = nram_input + (i % 2) * ping_pong_num; - __memcpy_async((char *)nram_input_load, (char *)input_start_ptr, - load_num_l * sizeof(int32_t), GDRAM2NRAM); - } - if (1 <= i && i < (repeat + 1)) { - int32_t load_num_l = (i - 1) == (repeat - 1) ? rem_num_l : core_num_l; - int32_t *nram_output_t = nram_output + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_input_t = nram_input + ((i - 1) % 2) * ping_pong_num; - genIndiceOutLast((int32_t *)nram_output_t, (int32_t *)nram_input_t, - (int32_t *)nram_aux, output_space, load_num_l); - } - if (i >= 2) { - int32_t load_num_l = (i - 2) == (repeat - 1) ? 
rem_num_l : core_num_l; - int32_t *nram_output_t = nram_output + ((i - 2) % 2) * ping_pong_num; - int32_t *indice_out_t = - (int32_t *)indice_out + (offset_l_job + (i - 2) * core_num_l) * 4; - __memcpy_async((char *)indice_out_t, (char *)nram_output_t, - load_num_l * 4 * sizeof(int32_t), NRAM2GDRAM); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockSubmGetIndicePairKernel1( - void *mask_all_ptr, void *indice_index_in_ptr, void *indice_in_expand_ptr, - void *indice_out_expand_ptr, void *indices_in, - FilterSpace host_filter_space, InputSpace host_input_space, - OutputSpace host_output_space, Stride host_stride, Dilation host_dilation, - Padding host_padding, int32_t core_num_l, int32_t input_active_site, - int32_t batch) { -#if __BANG_ARCH__ >= 370 - /* nram_space - |input| mask_all | indice_index_in | indice_out_expand | indice_in_expand | - 4l + l + 3kl |input| mask_all | indice_index_in | indice_out_expand | - indice_in_expand | 4l + l + 3kl | nram_aux_a 5lk | nram_aux_b 8lk | - */ - FilterSpace filter_space = host_filter_space; - InputSpace input_space = host_input_space; - OutputSpace output_space = host_output_space; - Stride stride = host_stride; - Dilation dilation = host_dilation; - Padding padding = host_padding; - int32_t k_d = filter_space.k_d, k_h = filter_space.k_h, - k_w = filter_space.k_w; - int32_t k_dhw = k_d * k_h * k_w; - genFilterIndex((float *)filter_kd_index, (float *)filter_kh_index, - (float *)filter_kw_index, k_d, k_h, k_w); - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(input_active_site, taskIdY, taskDimY, offset_l_job, len_l_job); - int32_t repeat = (len_l_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_l_job % core_num_l == 0 ? 
core_num_l : len_l_job % core_num_l; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t load_num = core_num_l * Ndim; - float *nram_output = (float *)nbuf_total + load_num; - int32_t len_l_k = core_num_l * k_dhw; - int32_t ping_pong_num = load_num + core_num_l + len_l_k * 3; - float *nram_aux_a = (float *)nbuf_total + 2 * ping_pong_num; - float *nram_aux_b = (float *)nram_aux_a + len_l_k * (Ndim + 1); - int step_index_start = offset_l_job; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - float *indices_in_addr = - (float *)indices_in + (offset_l_job + i * core_num_l) * Ndim; - int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - __memcpy_async((char *)nram_input_t, (char *)indices_in_addr, - deal_num * Ndim * sizeof(int), GDRAM2NRAM); - } - if (1 <= i && i < (repeat + 1)) { - int32_t deal_num = (i - 1) == repeat - 1 ? rem_num_l : core_num_l; - int32_t *nram_input_t = - (int32_t *)nram_input + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; - genIndiceInExpand(nram_output_t + 3 * deal_num * k_dhw, nram_input_t, - (int32_t *)nram_aux_a, deal_num, input_space); - computeIndex(nram_output_t, nram_input_t, (int32_t *)nram_aux_a, - nram_aux_b, output_space, stride, dilation, padding, - deal_num, step_index_start, k_dhw, batch); - step_index_start += deal_num; - } - if (i >= 2) { - int32_t deal_num = (i - 2) == repeat - 1 ? 
rem_num_l : core_num_l; - uint64_t gdram_offset = - (offset_l_job + (i - 2) * core_num_l) * sizeof(int32_t); - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)mask_all_ptr + gdram_offset, - (char *)(nram_output_t), deal_num * sizeof(int), - NRAM2GDRAM, input_active_site * sizeof(int), - deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_index_in_ptr + gdram_offset, - (char *)(nram_output_t + deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), - deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_out_expand_ptr + gdram_offset, - (char *)(nram_output_t + 2 * deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), - deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_in_expand_ptr + gdram_offset, - (char *)(nram_output_t + 3 * deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockSubmGetIndicePairKernel2( - void *indice_out, void *mask_all_ptr, void *indice_out_index_ptr, - void *indices_in, int32_t len_1_one, int32_t len_l_two, - int32_t core_num_1_one, int32_t core_num_l_two) { -#if __BANG_ARCH__ >= 370 - int32_t len_job = 0, offset_job = 0; - assignTask(len_1_one, taskIdY, taskDimY, offset_job, len_job); - int32_t repeat = (len_job + core_num_1_one - 1) / core_num_1_one; - int32_t rem_num_l = - len_job % core_num_1_one == 0 ? core_num_1_one : len_job % core_num_1_one; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t bit_width = sizeof(int32_t); - int32_t *indices_in_offset = (int32_t *)indices_in + offset_job; - int32_t *indice_out_offset = (int32_t *)indice_out + offset_job; - for (int i = 0; i < repeat; ++i) { - int32_t offset = i * core_num_1_one; - int32_t deal_num = i == repeat - 1 ? 
rem_num_l : core_num_1_one; - __memcpy_async((char *)nram_input, (char *)(indices_in_offset + offset), - deal_num * bit_width, GDRAM2NRAM); - __memcpy_async((char *)(indice_out_offset + offset), (char *)nram_input, - deal_num * bit_width, NRAM2GDRAM); - } - - assignTask(len_l_two, taskIdY, taskDimY, offset_job, len_job); - repeat = (len_job + core_num_l_two - 1) / core_num_l_two; - rem_num_l = - len_job % core_num_l_two == 0 ? core_num_l_two : len_job % core_num_l_two; - int32_t *mask_all_ptr_offset = (int32_t *)mask_all_ptr + offset_job; - int32_t *indice_out_index_ptr_offset = - (int32_t *)indice_out_index_ptr + offset_job; - int32_t *nram_output = (int32_t *)nbuf_total + core_num_l_two; - for (int i = 0; i < repeat; ++i) { - int32_t offset = i * core_num_l_two; - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l_two; - __memcpy((char *)nram_input, (char *)(mask_all_ptr_offset + offset), - deal_num * bit_width, GDRAM2NRAM); - __memcpy((char *)nram_output, - (char *)(indice_out_index_ptr_offset + offset), - deal_num * bit_width, GDRAM2NRAM); - __bang_ge_scalar((int32_t *)nram_output, (int32_t *)nram_output, (int)0, - deal_num); - __bang_and((int32_t *)nram_output, (int32_t *)nram_output, - (int32_t *)nram_input, deal_num); - __memcpy((char *)(mask_all_ptr_offset + offset), (char *)nram_output, - deal_num * bit_width, NRAM2GDRAM); - } -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *mask_all_ws, void *indice_index_in_ws, void *out_indices_expand_ws, - void *indices, FilterSpace filter_space, InputSpace input_space, - OutputSpace output_space, Stride stride, Dilation dilation, Padding padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch) { - KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel1<<>>( - (void *)mask_all_ws, (void *)indice_index_in_ws, - (void *)out_indices_expand_ws, (void *)indices, filter_space, input_space, - output_space, 
stride, dilation, padding, core_num_l, input_active_site, - batch)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *step_index_ptr, int32_t num_act_out, int32_t core_num_l) { - KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel2<<>>( - step_index_ptr, num_act_out, core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl3( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *indice_pairs, void *input_addr, void *mask_addr, - int32_t input_active_site, int32_t kernel_volume, int32_t core_num_l) { - KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel3<<>>( - indice_pairs, input_addr, mask_addr, input_active_site, kernel_volume, - core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl4( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *input_addr, OutputSpace output_space, - int32_t len_l, int32_t core_num_l) { - KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel4<<>>( - out_indices, input_addr, output_space, len_l, core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelBalanceGetIndicePair( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *balance_input, void *balance_mask, void *balance_output, - int32_t len_l, int32_t kernel_volume, int32_t core_num_l, - int32_t output_size) { - KERNEL_CHECK(MLUBlockBalanceGetIndicePairKernel<<>>( - balance_input, balance_mask, balance_output, len_l, kernel_volume, - core_num_l, output_size)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *mask_all_ptr, void *indice_index_in_ptr, void *indice_in_expand_ptr, - void *out_indices_expand_ptr, void *indices, FilterSpace filter_space, - 
InputSpace input_space, OutputSpace output_space, Stride stride, - Dilation dilation, Padding padding, int32_t core_num_l, - int32_t input_active_site, int32_t batch) { - KERNEL_CHECK(MLUBlockSubmGetIndicePairKernel1<<>>( - (void *)mask_all_ptr, (void *)indice_index_in_ptr, - (void *)indice_in_expand_ptr, (void *)out_indices_expand_ptr, - (void *)indices, filter_space, input_space, output_space, stride, - dilation, padding, core_num_l, input_active_site, batch)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *mask_all_ptr, void *out_indices_index_ptr, - void *indices, int32_t len_1_one, int32_t len_l_two, int32_t core_num_l_one, - int32_t core_num_l_two) { - KERNEL_CHECK(MLUBlockSubmGetIndicePairKernel2<<>>( - (void *)out_indices, (void *)mask_all_ptr, (void *)out_indices_index_ptr, - (void *)indices, len_1_one, len_l_two, core_num_l_one, core_num_l_two)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/get_indice_pairs/get_indice_pairs_structs.cpp b/kernels/get_indice_pairs/get_indice_pairs_structs.cpp deleted file mode 100644 index 44b00a55b..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_structs.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include -#include - -#include "core/logging.h" -#include "core/type.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API mluOpCreateSparseConvolutionDescriptor( - mluOpSparseConvolutionDescriptor_t *desc) { - if (desc == NULL) { - LOG(ERROR) << "mluOpCreateSparseConvolutionDescriptor failed, " - << "can't create desc when desc == NULL."; - return MLUOP_STATUS_NOT_INITIALIZED; - } - mluOpSparseConvolutionStruct *ts = - new (std::nothrow) mluOpSparseConvolutionStruct(); - *desc = ts; - return MLUOP_STATUS_SUCCESS; -} - -/* set sparse convolution descriptor. - * pad_dim_num = input_dim_num - 2, and each dim need two pad value. 
- */ -mluOpStatus_t MLUOP_WIN_API mluOpSetSparseConvolutionDescriptor( - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, int dimNb, int batch, - const int pad[], const int stride[], const int dilation[], - const int input_space[], const int filter_space[], const int output_space[], - const int sub_m, const int transpose, const int inverse) { - std::string interface_name = "[mluOpSetSparseConvolutionDescriptor]"; - PARAM_CHECK(interface_name, sparse_conv_desc != NULL); - PARAM_CHECK(interface_name, pad != NULL); - PARAM_CHECK(interface_name, stride != NULL); - PARAM_CHECK(interface_name, dilation != NULL); - PARAM_CHECK(interface_name, input_space != NULL); - PARAM_CHECK(interface_name, filter_space != NULL); - PARAM_CHECK(interface_name, output_space != NULL); - if (dimNb != 5) { - LOG(ERROR) << interface_name << " only " - << "support 3D_conv, dimnb should be 5. now dimNb is " << dimNb - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->dimNb = dimNb; - - if (batch <= 0) { - LOG(ERROR) << interface_name << " only " - << "support postive batch. now batch is " << batch << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->batch = batch; - - sparse_conv_desc->sub_m = sub_m; - - if (transpose != 0) { - LOG(ERROR) << interface_name << " : not " - << "support transpose . now transpose is " << transpose << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->transpose = transpose; - - if (inverse != 0) { - LOG(ERROR) << interface_name << " : not " - << "support inverse. 
now inverse is " << inverse << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->inverse = inverse; - - int kernel_dim = dimNb - 2; - for (int idx = 0; idx < kernel_dim; idx++) { - PARAM_CHECK_GE(interface_name, pad[idx], 0); - sparse_conv_desc->pad[idx] = pad[idx]; - PARAM_CHECK_GE(interface_name, stride[idx], 1); - sparse_conv_desc->stride[idx] = stride[idx]; - PARAM_CHECK_GE(interface_name, dilation[idx], 1); - sparse_conv_desc->dilation[idx] = dilation[idx]; - PARAM_CHECK_GE(interface_name, input_space[idx], 1); - sparse_conv_desc->input_space[idx] = input_space[idx]; - PARAM_CHECK_GE(interface_name, filter_space[idx], 1); - sparse_conv_desc->filter_space[idx] = filter_space[idx]; - PARAM_CHECK_GE(interface_name, output_space[idx], 1); - sparse_conv_desc->output_space[idx] = output_space[idx]; - } - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetSparseConvolutionNumActOut( - mluOpSparseConvolutionDescriptor_t desc, int *num_act_out) { - if (desc == NULL || num_act_out == NULL) { - LOG(ERROR) << "mluOpCreateSparseConvolutionDescriptor or " - << "num_act_out failed " - << " Passing NULL ptr to this API."; - return MLUOP_STATUS_NOT_INITIALIZED; - } - int size = 0; - size = desc->num_act_out; - num_act_out[0] = size; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDestroySparseConvolutionDescriptor( - mluOpSparseConvolutionDescriptor_t desc) { - if (desc == NULL) { - LOG(ERROR) << "mluOpDestroySparseConvolutionDescriptor fail. 
Passing NULL " - "ptr to this API."; - return MLUOP_STATUS_EXECUTION_FAILED; - } - delete desc; - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/get_indice_pairs/get_indice_pairs_structs.h b/kernels/get_indice_pairs/get_indice_pairs_structs.h deleted file mode 100644 index 083c33947..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_structs.h +++ /dev/null @@ -1,100 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#ifndef KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_STRUCTS_H_ -#define KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_STRUCTS_H_ - -#include "mlu_op.h" - -#define MAX_PAD_DIM 6 -#define MAX_STRIDE_DIM 6 -#define MAX_DILATION_DIM 6 -#define MAX_INPUT_DIM 3 -#define MAX_FILTER_DIM 3 -#define MAX_OUTPUT_DIM 3 - -#define INDICE_IN_LARGE_TENSOR_NUM 1000000 - -struct FilterSpace { - int k_d; - int k_h; - int k_w; - FilterSpace(const int &k_d_, const int &k_h_, const int &k_w_) - : k_d(k_d_), k_h(k_h_), k_w(k_w_) {} -}; -struct InputSpace { - int i_d; - int i_h; - int i_w; - InputSpace(const int &i_d_, const int &i_h_, const int &i_w_) - : i_d(i_d_), i_h(i_h_), i_w(i_w_) {} -}; - -struct OutputSpace { - int o_d; - int o_h; - int o_w; - OutputSpace(const int &o_d_, const int &o_h_, const int &o_w_) - : o_d(o_d_), o_h(o_h_), o_w(o_w_) {} -}; - -struct Stride { - int s_d; - int s_h; - int s_w; - Stride(const int &s_d_, const int &s_h_, const int &s_w_) - : s_d(s_d_), s_h(s_h_), s_w(s_w_) {} -}; - -struct Dilation { - int d_d; - int d_h; - int d_w; - Dilation(const int &d_d_, const int &d_h_, const int &d_w_) - : d_d(d_d_), d_h(d_h_), d_w(d_w_) {} -}; - -struct Padding { - int p_d; - int p_h; - int p_w; - Padding(const int &p_d_, const int &p_h_, const int &p_w_) - : p_d(p_d_), p_h(p_h_), p_w(p_w_) {} -}; - -struct mluOpSparseConvolutionStruct { - int dimNb; - int batch; - int pad[MAX_PAD_DIM]; - int stride[MAX_STRIDE_DIM]; - int dilation[MAX_DILATION_DIM]; - int input_space[MAX_INPUT_DIM]; - int filter_space[MAX_FILTER_DIM]; - int output_space[MAX_OUTPUT_DIM]; - int sub_m = 0; - int transpose = 0; - int inverse = 0; - int num_act_out = 0; -}; - -#endif // KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_STRUCTS_H_ diff --git a/kernels/get_indice_pairs/get_indice_pairs_utils.h b/kernels/get_indice_pairs/get_indice_pairs_utils.h deleted file mode 100644 index a4d0bdd45..000000000 --- 
a/kernels/get_indice_pairs/get_indice_pairs_utils.h +++ /dev/null @@ -1,348 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#ifndef KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ -#define KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ - -#include - -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "kernels/kernel.h" - -#if __BANG_ARCH__ >= 370 -__mlu_func__ void assignTask(const int32_t num_total_task, - const int32_t &taskid, const int32_t &taskdim, - int32_t &task_offset, int32_t &num_cur_task) { - int32_t num_per_task = num_total_task / taskdim; - int32_t rem_idx = num_total_task % taskdim; - if (taskid < rem_idx) { - task_offset = taskid * (num_per_task + 1); - num_cur_task = num_per_task + 1; - } else { - task_offset = taskid * num_per_task + rem_idx; - num_cur_task = num_per_task; - } -} - -/* -func: init filter index kd * kh * kw -*/ -__mlu_func__ void genFilterIndex(float *filter_kd_index, float *filter_kh_index, - float *filter_kw_index, int32_t k_d, - int32_t k_h, int32_t k_w) { - // kw kh, kd loop init - int32_t kdhw = k_d * k_h * k_w, khw = k_w * k_h, index_kd_count = 0, - index_kh_count = 0; - float index_kd = 0, index_kh = 0, index_kw = 0; - for (int i = 0; i < kdhw; ++i) { - filter_kw_index[i] = index_kw; - index_kw++; - if (index_kw >= k_w) index_kw = 0.0; - } - for (int i = 0; i < kdhw; ++i) { - filter_kh_index[i] = index_kh; - index_kh_count++; - if (index_kh_count % k_w == 0) index_kh++; - if (index_kh_count % khw == 0) index_kh = 0.0; - } - for (int i = 0; i < kdhw; ++i) { - filter_kd_index[i] = index_kd; - index_kd_count++; - if (index_kd_count % khw == 0) index_kd++; - } -} - -/* -func: generate stage index from start_index -*/ -__mlu_func__ void stepIndex(int32_t *dst_nram, int32_t start_index, - int32_t length) { -#if (__BANG_ARCH__ == 372 || __BANG_ARCH__ == 322 || __BANG_ARCH__ == 592) - int32_t align_num = 128; - int32_t repeat = (int32_t)(logf(length / align_num) / logf(2)); - int32_t remain = length / align_num - powf(2, repeat); - int32_t 
global_remain = length % align_num; - int32_t count = 1; - for (int32_t i = 0; i < align_num; ++i) { - dst_nram[i] = i + start_index; - if (i == length - 1) { - return; - } - } - for (int i = 0; i < repeat; ++i) { - __bang_add_scalar((int32_t *)dst_nram + count * align_num, - (int32_t *)dst_nram, count * align_num, - count * align_num); - count *= 2; - } - if (remain > 0) { - __bang_add_scalar((int32_t *)dst_nram + count * align_num, - (int32_t *)dst_nram, count * align_num, - remain * align_num); - } - if (global_remain > 0) { - __bang_add_scalar( - (int32_t *)dst_nram + count * align_num + remain * align_num, - (int32_t *)dst_nram, count * align_num + remain * align_num, - global_remain); - } - __sync(); -#endif -} - -/* -input: nram_input l -output: nram_output k,l -func: expand k nums input l -*/ -__mlu_func__ void expandInput(int32_t *nram_input, int32_t deal_num, - int32_t k) { - int offset = deal_num; - for (int i = 0; i < k; ++i) { - __bang_add_scalar((int32_t *)nram_input + offset, (int32_t *)nram_input, - (int32_t)0, deal_num); - offset += deal_num; - } -} - -/* -input: input_pos, fliter_pos, stride, padding, dilation -output: do ho wo -func: generate do ho wo -*/ -__mlu_func__ void computeOutputIndex(float *nram_output, float *nram_input, - float *temp, float *filter_kd_index, - float *filter_kh_index, - float *filter_kw_index, int32_t deal_num, - int32_t kdhw, Stride stride, - Dilation dilation, Padding padding) { - // formula: output_id = (input_id + padding - k_id * dilation) / stride - float stride_sd = 1.0 / (float)stride.s_d; - float stride_sh = 1.0 / (float)stride.s_h; - float stride_sw = 1.0 / (float)stride.s_w; - int32_t offset = deal_num; - for (int i = 0; i < 3; ++i) { - int32_t out_offset = offset - deal_num; - float stride_s = i == 0 ? stride_sd : i == 1 ? stride_sh : stride_sw; - int32_t padding_p = - i == 0 ? padding.p_d : i == 1 ? (padding.p_h) : (padding.p_w); - int32_t dilation_d = - i == 0 ? dilation.d_d : i == 1 ? 
dilation.d_h : dilation.d_w; - float *temp_filter_index = - i == 0 ? filter_kd_index : (i == 1 ? filter_kh_index : filter_kw_index); - __bang_add_scalar(nram_output + out_offset, nram_input + offset, - (float)(padding_p), deal_num); - __bang_mul_scalar(temp, temp_filter_index, (float)(dilation_d), kdhw); - __bang_cycle_sub(nram_output + out_offset, nram_output + out_offset, temp, - deal_num, kdhw); - __bang_mul_scalar(nram_output + out_offset, nram_output + out_offset, - stride_s, deal_num); - offset += deal_num; - } -} - -/* -input: nram_input float 3 k l do ho wo -output: nram_output float k l -func: generate mask represent output -*/ -__mlu_func__ void computeMask(float *nram_output, float *nram_input, - float *temp, int32_t deal_num, - OutputSpace output_space) { - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t offset = 0; - int32_t offset_temp2 = deal_num; - int32_t offset_temp3 = 2 * deal_num; - __bang_write_value((float *)nram_output, deal_num, (float)1.0); - for (int i = 0; i < 3; ++i) { - int32_t output_dim = i == 0 ? o_d : i == 1 ? 
o_h : o_w; - __bang_float2int32_tz((int32_t *)temp, (float *)nram_input + offset, - deal_num, 0); - __bang_int322float_rn((float *)temp, (int32_t *)temp, deal_num, 0); - __bang_sub((float *)temp + offset_temp2, (float *)temp, - (float *)nram_input + offset, deal_num); - __bang_le_scalar((float *)temp + offset_temp3, (float *)temp + offset_temp2, - (float)0.000001, deal_num); // < 1e-6 - __bang_ge_scalar((float *)temp + offset_temp2, (float *)temp + offset_temp2, - (float)-0.000001, deal_num); // > -1e-6 - __bang_and((float *)temp + offset_temp2, (float *)temp + offset_temp2, - (float *)temp + offset_temp3, deal_num); - __bang_ge_scalar((float *)temp + offset_temp3, (float *)temp, (float)0.0, - deal_num); - __bang_and((float *)temp + offset_temp2, (float *)temp + offset_temp2, - (float *)temp + offset_temp3, deal_num); - __bang_le_scalar((float *)temp + offset_temp3, (float *)temp, - (float)(output_dim - 1), deal_num); - __bang_and((float *)temp, (float *)temp + offset_temp3, - (float *)temp + offset_temp2, deal_num); - __bang_and((float *)nram_output, (float *)nram_output, (float *)temp, - deal_num); - offset += deal_num; - } -} - -/* -input: nram_input int32_t l,4 n do ho wo -output: nram_output int32_t l indice_out_expand -func: generate all_index from n do ho wo index -*/ -__mlu_func__ void genIndiceOutput(int32_t *nram_output, float *batch, - float *nram_input, int32_t *temp, - int32_t deal_num, OutputSpace output_space) { - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t o_hw = o_h * o_w, o_dhw = o_d * o_h * o_w; - __bang_float2int32_tz((int32_t *)temp + deal_num, (float *)batch, deal_num, - 0); // n - __bang_mul_scalar((int32_t *)temp, (int32_t *)temp + deal_num, (int32_t)o_dhw, - deal_num); // n * odhw - __bang_float2int32_tz((int32_t *)temp + 2 * deal_num, (float *)nram_input, - deal_num, 0); // do - __bang_mul_scalar((int32_t *)temp + deal_num, (int32_t *)temp + 2 * deal_num, - (int32_t)o_hw, deal_num); // do * 
o_hw - __bang_add((int32_t *)temp, (int32_t *)temp, (int32_t *)temp + deal_num, - deal_num); - __bang_float2int32_tz((int32_t *)temp + 2 * deal_num, - (float *)nram_input + deal_num, deal_num, 0); // ho - __bang_mul_scalar((int32_t *)temp + deal_num, (int32_t *)temp + 2 * deal_num, - (int32_t)o_w, deal_num); - __bang_add((int32_t *)temp, (int32_t *)temp, (int32_t *)temp + deal_num, - deal_num); - __bang_float2int32_tz((int32_t *)temp + deal_num, - (float *)nram_input + 2 * deal_num, deal_num, 0); // wo - __bang_add((int32_t *)nram_output, (int32_t *)temp, - (int32_t *)temp + deal_num, deal_num); -} - -/* -input: nram_output int32_t k,l indice_outout_expand - mask_all float k,l mask_all -output nram_output int32_t k,l indice_output_expand -func: turn invalid index into int_max -*/ -__mlu_func__ void genIndiceOutExpand(int32_t *nram_output, int32_t *mask_all, - int32_t *nram_input, int32_t *temp, - int32_t deal_num, int32_t output_size) { - __bang_mul_scalar((int32_t *)temp, (int32_t *)mask_all, int(-1), deal_num); - __bang_band((char *)nram_output, (char *)nram_input, (char *)temp, - deal_num * sizeof(int32_t)); - // clost to intmax - __bang_sub_scalar((int32_t *)temp, (int32_t *)mask_all, int(1), deal_num); - __bang_mul_scalar((int32_t *)temp, (int32_t *)temp, int(-1 * output_size), - deal_num); - __bang_bor((char *)nram_output, (char *)nram_output, (char *)temp, - deal_num * sizeof(int32_t)); -} - -/* -input: nram_input int32_t L indice_out_expand -output: nram_output int32_t L,4 indice_out -func: generate n,do,ho,wo index from input all_index -limits: imp on 300 -*/ -__mlu_func__ void genIndiceOutLast(int32_t *nram_output, int32_t *nram_input, - int32_t *nram_aux, OutputSpace output_space, - int32_t deal_num) { -#if __BANG_ARCH__ >= 590 - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t o_hw = o_h * o_w, o_dhw = o_d * o_h * o_w; - __bang_div((int32_t *)nram_aux, (int32_t *)nram_input, (int)o_dhw, - deal_num); // n - 
__bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux, (int)o_dhw, - deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, (int *)nram_output, - deal_num); - __bang_div((int32_t *)nram_aux + deal_num, (int32_t *)nram_input, (int)o_hw, - deal_num); // d - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + deal_num, - (int)o_hw, deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); - - __bang_div((int32_t *)nram_aux + 2 * deal_num, (int32_t *)nram_input, - (int)o_w, deal_num); // h - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + 2 * deal_num, - (int)o_w, deal_num); - __bang_sub((int32_t *)nram_aux + 3 * deal_num, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); // w - __bang_transpose((int32_t *)nram_output, (int32_t *)nram_aux, 4, deal_num); -#else - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t o_hw = o_h * o_w, o_dhw = o_d * o_h * o_w; - __bang_write_value((int32_t *)nram_aux + 4 * deal_num, deal_num, int(o_dhw)); - __cn_vector_div_s32(deal_num, (int32_t *)nram_aux, (int32_t *)nram_input, - (int32_t *)nram_aux + 4 * deal_num); - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux, (int)o_dhw, - deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, (int *)nram_output, - deal_num); - __bang_write_value((int32_t *)nram_aux + 4 * deal_num, deal_num, int(o_hw)); - __cn_vector_div_s32(deal_num, (int32_t *)nram_aux + deal_num, - (int32_t *)nram_input, - (int32_t *)nram_aux + 4 * deal_num); - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + deal_num, - (int)o_hw, deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); - - __bang_write_value((int32_t *)nram_aux + 4 * deal_num, deal_num, int(o_w)); - __cn_vector_div_s32(deal_num, (int32_t *)nram_aux + 2 * deal_num, - (int32_t *)nram_input, - (int32_t *)nram_aux + 4 * 
deal_num); - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + 2 * deal_num, - (int)o_w, deal_num); - __bang_sub((int32_t *)nram_aux + 3 * deal_num, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); // w - __bang_transpose((int32_t *)nram_output, (int32_t *)nram_aux, 4, deal_num); -#endif -} - -/* -input: nram_input int32_t l,4 indice_in -output: nram_output int32_t l indice_in_expand -func: generate all_index from n di hi wi index -*/ -__mlu_func__ void genIndiceInExpand(int32_t *nram_output, int32_t *nram_input, - int32_t *nram_aux, int32_t deal_num, - InputSpace input_space) { - __bang_transpose((int32_t *)nram_aux, (int32_t *)nram_input, deal_num, 4); - int32_t i_d = input_space.i_d, i_h = input_space.i_h, i_w = input_space.i_w; - int32_t i_hw = i_h * i_w, i_dhw = i_d * i_h * i_w; - __bang_mul_scalar((int32_t *)nram_aux + 4 * deal_num, - (int32_t *)nram_aux + 2 * deal_num, int32_t(i_w), deal_num); - __bang_add((int32_t *)nram_output, (int32_t *)nram_aux + 4 * deal_num, - (int32_t *)nram_aux + 3 * deal_num, deal_num); - __bang_mul_scalar((int32_t *)nram_aux + 4 * deal_num, - (int32_t *)nram_aux + deal_num, int32_t(i_hw), deal_num); - __bang_add((int32_t *)nram_output, (int32_t *)nram_output, - (int32_t *)nram_aux + 4 * deal_num, deal_num); - __bang_mul_scalar((int32_t *)nram_aux + 4 * deal_num, (int32_t *)nram_aux, - int32_t(i_dhw), deal_num); - __bang_add((int32_t *)nram_output, (int32_t *)nram_output, - (int32_t *)nram_aux + 4 * deal_num, deal_num); -} -#endif -#endif // KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ diff --git a/kernels/get_indice_pairs/normal_get_indice_pairs.cpp b/kernels/get_indice_pairs/normal_get_indice_pairs.cpp deleted file mode 100644 index 5f96b1053..000000000 --- a/kernels/get_indice_pairs/normal_get_indice_pairs.cpp +++ /dev/null @@ -1,1299 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include -#include - -#include "core/context.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -static mluOpStatus_t getIndiceMaskAll( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceIndexIn( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceIndexOut( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceOutExpand( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceInExpand( - const mluOpTensorDescriptor_t indice_pairs_desc, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = 
input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceUnique( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = (kernel_volume * input_active_site + 1) * - sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getGridOut(const mluOpTensorDescriptor_t indice_pairs_desc, - int output_size, size_t *size) { - size_t total_size = 0; - total_size = output_size * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getReduceOpWS(mluOpHandle_t handle, - const std::string interface_name, - const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - mluOpTensorDescriptor_t reduce_in_desc, reduce_out_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_out_desc)); - std::vector reduce_in_dims = {kernel_volume, input_active_site}; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_in_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - reduce_in_dims[1] = 1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_out_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - // reduce along lowest dimension - int axis[1] = {1}; - int axis_num = 1; - cnnlReduceDescriptor_t reduce_desc; - CALL_CNNL(cnnlCreateReduceDescriptor(&reduce_desc)); - CALL_CNNL(cnnlSetReduceDescriptor( - reduce_desc, axis, axis_num, CNNL_REDUCE_ADD, - cnnlDataType_t(reduce_in_desc->dtype), CNNL_PROPAGATE_NAN, - 
CNNL_REDUCE_NO_INDICES, CNNL_16BIT_INDICES)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_in_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGetReduceOpWorkspaceSize(cnnl_handle, cnnl_input_desc, - cnnl_output_desc, reduce_desc, - &total_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_out_desc)); - CALL_CNNL(cnnlDestroyReduceDescriptor(reduce_desc)); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getUniqueOpWS(mluOpHandle_t handle, - const std::string interface_name, - const mluOpTensorDescriptor_t indices_desc, - const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - mluOpTensorDescriptor_t input_unique_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&input_unique_desc)); - std::vector unique_in_dims = {kernel_volume * input_active_site}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(input_unique_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, unique_in_dims.size(), - unique_in_dims.data())); - - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL( - cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, false, false)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_unique_desc, - cnnl_input_desc); - CALL_CNNL(cnnlGetUniqueWorkspaceSize(cnnl_handle, unique_desc, - 
cnnl_input_desc, &total_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(input_unique_desc)); - - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t getNormalGetIndicePairsWorkspaceSize( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t out_indices_desc, - const mluOpTensorDescriptor_t indice_num_desc, size_t *return_ws) { - // workspace for get_indice_pairs - size_t total_size = 0; - int sub_m = sparse_conv_desc->sub_m; - int batch = sparse_conv_desc->batch; - int kernel_volume = indice_pairs_desc->dims[0]; - int input_active_site = indice_pairs_desc->dims[2]; - int output_size = batch * sparse_conv_desc->output_space[0] * - sparse_conv_desc->output_space[1] * - sparse_conv_desc->output_space[2] + - 1; - size_t mask_all_ws = 0, indice_index_in_ws = 0, indice_index_out_ws = 0; - size_t out_indices_expand_ws = 0, grid_out_ws = 0, reduce_op_ws = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceMaskAll(indice_pairs_desc, kernel_volume, - input_active_site, &mask_all_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexIn(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_in_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexOut(indice_pairs_desc, kernel_volume, input_active_site, - &indice_index_out_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceOutExpand(indice_pairs_desc, kernel_volume, - input_active_site, &out_indices_expand_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getGridOut(indice_pairs_desc, 
output_size, &grid_out_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getReduceOpWS(handle, interface_name, kernel_volume, - input_active_site, &reduce_op_ws)); - if (sub_m) { - /* workspace for subm mode - | mask_all |indices_index_in | indices_index_out/ step_index | - indices_in_expand |out_indices_expand| max(grid_out_ws, reduce_op_ws)| - */ - size_t indice_in_expand_ws = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceInExpand(indice_pairs_desc, input_active_site, - &indice_in_expand_ws)); - total_size = mask_all_ws + indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_in_expand_ws + - std::max(grid_out_ws, reduce_op_ws); - } else { - /* workspace for default mode - | mask_all | indices_index_in | step_index/ indices_index_out | - out_indices_expand | | out_indices_unique | max(grid_out_ws, reduce_ws, - unique_ws) | - */ - size_t indice_unique_ws = 0, unique_op_ws = 0; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getUniqueOpWS(handle, interface_name, indices_desc, kernel_volume, - input_active_site, &unique_op_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceUnique(indice_pairs_desc, kernel_volume, - input_active_site, &indice_unique_ws)); - total_size = mask_all_ws + indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws + - std::max(grid_out_ws, std::max(reduce_op_ws, unique_op_ws)); - } - return_ws[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -/* DefaultKernel1 -intput: indices l,4 int -output: mask_all k,l int - indice_index_in k,l int - out_indices_expand k,l int -func: gen mask_all, indice_index_in, out_indices_expand for next step. 
-*/ -mluOpStatus_t launchDefaultKernel1( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const void *indices, void *mask_all_ws, void *indice_index_in_ws, - void *out_indices_expand_ws, int batch, int kernel_volume, - int input_active_site) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int nums = 19 * kernel_volume + 8; - int core_num_l = (nram_size - 4 * 4096 * 3) / nums / sizeof(int); - int jobs = (input_active_site + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - /* nram_space */ - // |input| mask_all | indice_index_in | out_indices_expand | l + 3 k l - // |input| mask_all | indice_index_in | out_indices_expand | l + 3 k l - // | nram_aux_a 5 l k | nram_aux_b 8 l k - // ping + pong + aux - FilterSpace filter_space(sparse_conv_desc->filter_space[0], - sparse_conv_desc->filter_space[1], - sparse_conv_desc->filter_space[2]); - InputSpace input_space(sparse_conv_desc->input_space[0], - sparse_conv_desc->input_space[1], - sparse_conv_desc->input_space[2]); - OutputSpace output_space(sparse_conv_desc->output_space[0], - sparse_conv_desc->output_space[1], - sparse_conv_desc->output_space[2]); - Stride stride(sparse_conv_desc->stride[0], sparse_conv_desc->stride[1], - sparse_conv_desc->stride[2]); - Dilation dilation(sparse_conv_desc->dilation[0], - sparse_conv_desc->dilation[1], - sparse_conv_desc->dilation[2]); - Padding padding(sparse_conv_desc->pad[0], sparse_conv_desc->pad[1], - sparse_conv_desc->pad[2]); - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl1<<>>"; - CHECK_RETURN( - "[getIndicePairsDefault]", 
- KernelDefaultGetIndicePairKl1( - kDim3, func_type, handle->queue, (void *)mask_all_ws, - (void *)indice_index_in_ws, (void *)out_indices_expand_ws, - (void *)indices, filter_space, input_space, output_space, stride, - dilation, padding, core_num_l, input_active_site, batch)); - return MLUOP_STATUS_SUCCESS; -} - -/* SubmKernel1 -intput: indices l,4 int -output: mask_all k,l int - indice_index_in k,l int - indice_in_expand l, int - out_indices_expand k,l int -func: gen mask_all, indice_index_in, indice_in_expand, out_indices_expand for -next step. -*/ -mluOpStatus_t launchSubmKernel1( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const void *indices, void *mask_all_ptr, void *indice_index_in_ptr, - void *indice_in_expand_ptr, void *out_indices_expand_ptr, int batch, - int kernel_volume, int input_active_site) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int nums = 19 * kernel_volume + 10; - int core_num_l = (nram_size - 4 * 4096 * 3) / nums / sizeof(int); - int jobs = (input_active_site + core_num_l - 1) / core_num_l; - int least_jobs = (input_active_site * sizeof(int) + 1024 - 1) / 1024; - jobs = std::max(jobs, least_jobs); - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - /* nram_space - |input| mask_all | indice_index_in | out_indices_expand | indice_in_expand - |4l + l + 3kl |input| mask_all | indice_index_in | out_indices_expand | - indice_in_expand |4l + l + 3kl | nram_aux_a 5lk | nram_aux_b 8lk | - */ - FilterSpace filter_space(sparse_conv_desc->filter_space[0], - sparse_conv_desc->filter_space[1], - sparse_conv_desc->filter_space[2]); - InputSpace input_space(sparse_conv_desc->input_space[0], - sparse_conv_desc->input_space[1], - sparse_conv_desc->input_space[2]); - OutputSpace output_space(sparse_conv_desc->output_space[0], - sparse_conv_desc->output_space[1], - sparse_conv_desc->output_space[2]); - Stride stride(sparse_conv_desc->stride[0], sparse_conv_desc->stride[1], - sparse_conv_desc->stride[2]); - Dilation dilation(sparse_conv_desc->stride[0], sparse_conv_desc->stride[1], - sparse_conv_desc->stride[2]); - Padding padding(sparse_conv_desc->pad[0], sparse_conv_desc->pad[1], - sparse_conv_desc->pad[2]); - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelSubmGetIndicePairKl1<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelSubmGetIndicePairKl1( - kDim3, func_type, handle->queue, (void *)mask_all_ptr, - (void *)indice_index_in_ptr, (void *)indice_in_expand_ptr, - (void *)out_indices_expand_ptr, (void *)indices, - filter_space, input_space, output_space, stride, dilation, - padding, core_num_l, input_active_site, batch)); - return MLUOP_STATUS_SUCCESS; -} - -/* SubmKernel2 -intput: indices l,4 int - out_indices_expand_ptr k,l int - mask_all_ptr k,l int -output: mask_all k,l int - out_indices l,4 int -func: gen out_indices from indices in subm mode; - gen mask_all by and out_indices_expand_ptr/ mask_all_ptr. 
-*/ -mluOpStatus_t launchSubmKernel2(mluOpHandle_t handle, const void *indices, - void *out_indices_index_ptr, void *mask_all_ptr, - void *out_indices, int kernel_volume, - int input_active_site) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l_two = (nram_size - 4 * 4096 * 3) / 2 / sizeof(int); - int core_num_l_one = (nram_size - 4 * 4096 * 3) / sizeof(int); - int len_1_one = input_active_site * 4; - int len_l_two = input_active_site * kernel_volume; - int jobs_one = (len_1_one + core_num_l_one - 1) / core_num_l_one; - int jobs_two = (len_l_two + core_num_l_two - 1) / core_num_l_two; - int least_job_one = (len_1_one * sizeof(int) + 1024 - 1) / 1024; - int least_job_two = (len_l_two * sizeof(int) + 1024 - 1) / 1024; - int least_jobs = std::max(least_job_one, least_job_two); - int jobs = std::max(std::max(jobs_one, jobs_two), least_jobs); - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelSubmGetIndicePairKl2<<>>"; - CHECK_RETURN( - "[getIndicePairsDefault]", - KernelSubmGetIndicePairKl2( - kDim3, func_type, handle->queue, (void *)out_indices, - (void *)mask_all_ptr, (void *)out_indices_index_ptr, (void *)indices, - len_1_one, len_l_two, core_num_l_one, core_num_l_two)); - return MLUOP_STATUS_SUCCESS; -} - -// call reduce op -mluOpStatus_t launchReduceOp(mluOpHandle_t handle, - const std::string interface_name, - void *reduce_output_addr, void *reduce_input_addr, - void *reduce_workspace_ptr, size_t reduce_op_ws, - int kernel_volume, int input_active_site) { - mluOpTensorDescriptor_t reduce_in_desc, reduce_out_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_out_desc)); - std::vector reduce_in_dims = {kernel_volume, input_active_site}; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_in_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - reduce_in_dims[1] = 1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_out_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - // reduce along lowest dimension - int axis[1] = {1}; - int axis_num = 1; - cnnlReduceDescriptor_t reduce_desc; - CALL_CNNL(cnnlCreateReduceDescriptor(&reduce_desc)); - CALL_CNNL(cnnlSetReduceDescriptor( - reduce_desc, axis, axis_num, CNNL_REDUCE_ADD, - cnnlDataType_t(reduce_in_desc->dtype), CNNL_PROPAGATE_NAN, - CNNL_REDUCE_NO_INDICES, CNNL_16BIT_INDICES)); - void *alpha = NULL, *beta = NULL, *indices = NULL; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_in_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlReduce(cnnl_handle, reduce_desc, reduce_workspace_ptr, - reduce_op_ws, alpha, cnnl_input_desc, - reduce_input_addr, 0, indices, beta, cnnl_output_desc, - reduce_output_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_out_desc)); - CALL_CNNL(cnnlDestroyReduceDescriptor(reduce_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// call unqiue_v2 op -mluOpStatus_t launchUniqueOp(mluOpHandle_t handle, - const std::string interface_name, - void *unique_output_addr, void *unique_input_addr, - void *unique_output_num_addr, - void *unique_workspace_ptr, size_t unique_op_ws, - int kernel_volume, int input_active_site, - int *return_num_act) { - mluOpTensorDescriptor_t unique_input_desc, unique_output_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&unique_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&unique_output_desc)); - std::vector unique_in_dims = {kernel_volume * input_active_site}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(unique_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, unique_in_dims.size(), - unique_in_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(unique_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, unique_in_dims.size(), - unique_in_dims.data())); - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - 
CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL( - cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, false, false)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(unique_input_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(unique_output_desc, - cnnl_output_desc); - CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc, - unique_input_addr, unique_workspace_ptr, - unique_op_ws, (int *)unique_output_num_addr, - cnnl_output_desc, unique_output_addr, nullptr, - nullptr, nullptr, nullptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - cnrtQueueSync(handle->queue); - cnrtMemcpy(return_num_act, unique_output_num_addr, sizeof(float), - CNRT_MEM_TRANS_DIR_DEV2HOST); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(unique_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(unique_output_desc)); - return MLUOP_STATUS_SUCCESS; -} - -/* -DefaultKernel2 -input: num_act_out -output: step_index -func: generate tensor incluing 0-num_act_out continuously -*/ -mluOpStatus_t launchDefaultKernel2(mluOpHandle_t handle, - void *step_index_output_ptr, - int num_act_out) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l = (nram_size - 4 * 4096 * 3) / sizeof(int); - int jobs = (num_act_out + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl2<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl2(kDim3, func_type, handle->queue, - step_index_output_ptr, num_act_out, - core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -/* -BalanceKernel -input: out_indices_expand_ptr -mask : mask_all_ptr -output: out_indices_expand_ptr -func: balance index distribution -*/ -mluOpStatus_t launchBalanceKernel(mluOpHandle_t handle, - const std::string interface_name, - void *balance_input_addr, - void *balance_output_addr, - void *balance_mask_addr, - int input_active_site, int kernel_volume, - int output_size) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l = (nram_size - 4 * 4096 * 3) / 8 / sizeof(int); - int jobs = (input_active_site * kernel_volume + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelBalanceGetIndicePair<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelBalanceGetIndicePair( - kDim3, func_type, handle->queue, balance_input_addr, - balance_mask_addr, balance_output_addr, input_active_site, - kernel_volume, core_num_l, output_size)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t launchFillOp(mluOpHandle_t handle, - const std::string interface_name, - void *mluOp_fill_addr, int output_size, - int fill_value) { - mluOpTensorDescriptor_t fill_tensor_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&fill_tensor_desc)); - std::vector fill_in_dims = {output_size}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - fill_tensor_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, fill_in_dims.size(), - fill_in_dims.data())); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(fill_tensor_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, mluOp_fill_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(fill_tensor_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// call scatter_nd op -mluOpStatus_t launchScatterNdOp(mluOpHandle_t handle, - const std::string interface_name, - void *scatter_output_addr, - void *scatter_input_addr, - void *scatter_indice_addr, int output_size, - int num_act_out) { - VLOG(5) << interface_name << " call scatterNd"; - cnnlScatterNdMode_t scatter_mode = CNNL_SCATTERND_UPDATE; - mluOpTensorDescriptor_t scatter_input_desc, scatter_output_desc, - scatter_indice_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS 
== mluOpCreateTensorDescriptor(&scatter_input_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpCreateTensorDescriptor(&scatter_output_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpCreateTensorDescriptor(&scatter_indice_desc)); - std::vector scatter_in_dims = {num_act_out}; - std::vector scatter_out_dims = {output_size}; - std::vector scatter_indice_dims = {num_act_out, 1}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpSetTensorDescriptor( - scatter_indice_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, scatter_indice_dims.size(), - scatter_indice_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(scatter_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, scatter_in_dims.size(), - scatter_in_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(scatter_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, scatter_out_dims.size(), - scatter_out_dims.data())); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_input_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_output_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_output_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2( - cnnl_handle, scatter_mode, cnnl_indices_desc, scatter_indice_addr, - cnnl_updates_desc, scatter_input_addr, cnnl_input_desc, - scatter_output_addr, cnnl_output_desc, scatter_output_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == 
mluOpDestroyTensorDescriptor(scatter_input_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(scatter_output_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(scatter_indice_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// call gather_nd op -mluOpStatus_t launchGatherNdOp(mluOpHandle_t handle, - const std::string interface_name, - void *gather_input_addr, - void *gather_output_addr, - void *gather_indice_addr, int input_active_site, - int kernel_volume, int output_size) { - VLOG(5) << interface_name << " call gatherNd"; - mluOpTensorDescriptor_t gather_input_desc, gather_output_desc, - gather_indice_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&gather_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&gather_output_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&gather_indice_desc)); - std::vector gather_in_dims = {output_size}; - std::vector gather_indices_dims = {input_active_site * kernel_volume, 1}; - std::vector gather_out_dims = {input_active_site * kernel_volume}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpSetTensorDescriptor( - gather_indice_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, gather_indices_dims.size(), - gather_indices_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(gather_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, gather_in_dims.size(), - gather_in_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(gather_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, gather_out_dims.size(), - gather_out_dims.data())); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_input_desc, - cnnl_params_desc); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_output_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, gather_input_addr, - cnnl_indices_desc, gather_indice_addr, - cnnl_output_desc, gather_output_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(gather_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(gather_output_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(gather_indice_desc)); - return MLUOP_STATUS_SUCCESS; -} - -/* DefaultKernel3 -input: tensor1: kl int32 indice_index_in - tensor2: kl int32 indice_index_out - tensor3: kl int32 mask -output: tensor: k2l -func: maskmove efficient data continuously by collect insts -*/ -mluOpStatus_t launchDefaultKernel3(mluOpHandle_t handle, void *output_addr, - void *input_addr, void *mask_addr, - int input_active_site, int kernel_volume) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l = (nram_size - 4 * 4096 * 3) / 4 / sizeof(int); - int jobs = 2 * kernel_volume; - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl3<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl3( - kDim3, func_type, handle->queue, output_addr, input_addr, - mask_addr, input_active_site, kernel_volume, core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -/* -DefaultKernel4 -input: tensor num_act_out int -output: tensor num_act_out,4 int -func: generate tensor incluing 0-num_act_out continuously -*/ -mluOpStatus_t launchDefaultKernel4( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - void *output_addr, void *input_addr, int num_act_out) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_split = 0; - if (handle->arch >= MLUOP_MLU590) { - core_num_split = 14; - } else { - core_num_split = 15; - } - int core_num_l = (nram_size - 4 * 4096 * 3) / core_num_split / sizeof(int); - int jobs = (num_act_out + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - OutputSpace output_space(sparse_conv_desc->output_space[0], - sparse_conv_desc->output_space[1], - sparse_conv_desc->output_space[2]); - - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl4<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl4( - kDim3, func_type, handle->queue, output_addr, input_addr, - output_space, num_act_out, core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t NormalGetIndicePairsKernel( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, const mluOpTensorDescriptor_t indice_pairs_desc, - void *indice_pairs, const mluOpTensorDescriptor_t out_indices_desc, - void *out_indices, const mluOpTensorDescriptor_t indice_num_desc, - void *indice_num) { - int sub_m = sparse_conv_desc->sub_m; - int batch = sparse_conv_desc->batch; - int kernel_volume = indice_pairs_desc->dims[0]; - int input_active_site = indice_pairs_desc->dims[2]; - int output_size = batch * sparse_conv_desc->output_space[0] * - sparse_conv_desc->output_space[1] * - sparse_conv_desc->output_space[2] + - 1; - - if (sub_m) { - /* workspace for subm mode - | mask_all |indices_index_in | indices_index_out/ step_index | - indices_in_expand |out_indices_expand| | max(grid_out, reduce_op_ws)| - */ - size_t mask_all_ws = 0, indice_index_in_ws = 0, indice_index_out_ws = 0; - size_t indice_in_expand_ws = 0, out_indices_expand_ws = 0, grid_out_ws = 0; - size_t reduce_op_ws = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceMaskAll(indice_pairs_desc, kernel_volume, - input_active_site, &mask_all_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexIn(indice_pairs_desc, kernel_volume, - input_active_site, 
&indice_index_in_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexOut(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_out_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceOutExpand(indice_pairs_desc, kernel_volume, - input_active_site, &out_indices_expand_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceInExpand(indice_pairs_desc, input_active_site, - &indice_in_expand_ws)); - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - getGridOut(indice_pairs_desc, - output_size, &grid_out_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getReduceOpWS(handle, interface_name, kernel_volume, - input_active_site, &reduce_op_ws)); - const void *compute_indices_ptr = indices; - void *mask_all_ptr = (void *)((char *)workspace); - void *indice_index_in_ptr = (void *)((char *)workspace + mask_all_ws); - void *indice_in_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws); - void *out_indices_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws + indice_in_expand_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchSubmKernel1(handle, sparse_conv_desc, compute_indices_ptr, - mask_all_ptr, indice_index_in_ptr, - indice_in_expand_ptr, out_indices_expand_ptr, - batch, kernel_volume, input_active_site)); - - // call launchDefaultKernel2 gen step_index - void *step_index_addr = NULL; - step_index_addr = - (void *)((char *)(char *)workspace + mask_all_ws + indice_index_in_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel2(handle, step_index_addr, input_active_site)); - - // call scatter_nd unique_output_addr + step_index_addr = grid_out_addr - void *scatter_input_addr = NULL, *scatter_output_addr = NULL, - *scatter_indice_addr = NULL; - scatter_input_addr = step_index_addr; - scatter_indice_addr = 
indice_in_expand_ptr; - scatter_output_addr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - indice_in_expand_ws + out_indices_expand_ws); - int fill_value = -1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, scatter_output_addr, - output_size, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchScatterNdOp(handle, interface_name, scatter_output_addr, - scatter_input_addr, scatter_indice_addr, - output_size, input_active_site)); - - // call gather_nd out_indices_expand + grid_out_addr = indice_index_out - void *gather_input_addr = NULL, *gather_output_addr = NULL, - *gather_indice_addr = NULL; - gather_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - gather_input_addr = scatter_output_addr; - gather_indice_addr = out_indices_expand_ptr; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchGatherNdOp(handle, interface_name, gather_input_addr, - gather_output_addr, gather_indice_addr, - input_active_site, kernel_volume, output_size)); - - // call sumb_kernel2 indice_index_out and mask_all = mask_all - // get out_indices from indices - const void *kernel2_input1_addr = NULL; - void *kernel2_input2_addr = NULL, *kernel2_output1_addr = NULL, - *kernel2_output2_addr = NULL; - kernel2_input1_addr = indices; - kernel2_input2_addr = gather_output_addr; - kernel2_output1_addr = mask_all_ptr; - kernel2_output2_addr = out_indices; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchSubmKernel2(handle, kernel2_input1_addr, kernel2_input2_addr, - kernel2_output1_addr, kernel2_output2_addr, - kernel_volume, input_active_site)); - - // call reduceOp - void *reduce_input_addr = NULL, *reduce_output_addr = NULL; - reduce_input_addr = mask_all_ptr; - reduce_output_addr = indice_num; - void *reduce_workspace_ptr = NULL; - if (reduce_op_ws > 0) { - reduce_workspace_ptr = - (void *)((char 
*)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws + indice_in_expand_ws + - out_indices_expand_ws); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchReduceOp(handle, interface_name, reduce_output_addr, - reduce_input_addr, reduce_workspace_ptr, - reduce_op_ws, kernel_volume, input_active_site)); - - // call launchDefaultKernel3 l k partition and sort - void *kernel3_input_addr = NULL, *kernel3_output_addr = NULL, - *kernel3_mask_addr = NULL; - kernel3_input_addr = indice_index_in_ptr; - kernel3_output_addr = indice_pairs; - kernel3_mask_addr = mask_all_ptr; - fill_value = -1; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_pairs, - kernel_volume * 2 * input_active_site, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel3(handle, kernel3_output_addr, - kernel3_input_addr, kernel3_mask_addr, - input_active_site, kernel_volume)); - } else { - /* workspace for default mode - | mask_all | indices_index_in | step_index/ indices_index_out | - out_indices_expand | | out_indices_unique | max(grid_out_ws, reduce_ws, - unique_ws) | - */ - size_t mask_all_ws = 0, indice_index_in_ws = 0, indice_index_out_ws = 0; - size_t out_indices_expand_ws = 0, indice_unique_ws = 0, grid_out_ws = 0; - size_t reduce_op_ws = 0, unique_op_ws = 0; - - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceMaskAll(indice_pairs_desc, kernel_volume, - input_active_site, &mask_all_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexIn(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_in_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexOut(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_out_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceOutExpand(indice_pairs_desc, kernel_volume, - input_active_site, 
&out_indices_expand_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceUnique(indice_pairs_desc, kernel_volume, - input_active_site, &indice_unique_ws)); - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - getGridOut(indice_pairs_desc, - output_size, &grid_out_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getReduceOpWS(handle, interface_name, kernel_volume, - input_active_site, &reduce_op_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getUniqueOpWS(handle, interface_name, indices_desc, kernel_volume, - input_active_site, &unique_op_ws)); - const void *compute_indices_ptr = indices; - void *mask_all_ptr = (void *)((char *)workspace); - void *indice_index_in_ptr = (void *)((char *)workspace + mask_all_ws); - void *out_indices_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_out_ws + - indice_index_in_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel1(handle, sparse_conv_desc, compute_indices_ptr, - mask_all_ptr, indice_index_in_ptr, - out_indices_expand_ptr, batch, kernel_volume, - input_active_site)); - - // call reduce_sum mask_all to indice_num - void *reduce_input_addr = NULL, *reduce_output_addr = NULL; - reduce_input_addr = mask_all_ptr; - reduce_output_addr = indice_num; - void *reduce_workspace_ptr = NULL; - if (reduce_op_ws > 0) { - reduce_workspace_ptr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchReduceOp(handle, interface_name, reduce_output_addr, - reduce_input_addr, reduce_workspace_ptr, - reduce_op_ws, kernel_volume, input_active_site)); - - // call unique_v2 out_indices_expand_ptr indice_unique_ws_ptr - int num_act_out = 0; - void *unique_input_addr = NULL, *unique_output_addr = NULL, - *unique_output_num_addr = NULL; - unique_input_addr = 
out_indices_expand_ptr; - unique_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws + out_indices_expand_ws); - unique_output_num_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - void *unique_workspace_ptr = NULL; - if (unique_op_ws > 0) { - unique_workspace_ptr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchUniqueOp(handle, interface_name, unique_output_addr, - unique_input_addr, unique_output_num_addr, - unique_workspace_ptr, unique_op_ws, kernel_volume, - input_active_site, &num_act_out)); - - if (num_act_out != kernel_volume * input_active_site) { - num_act_out = num_act_out - 1; - } - if (num_act_out <= 0) { - // fill indice_pairs -1 indice_num 0 - int fill_value = -1; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_pairs, - kernel_volume * 2 * input_active_site, fill_value)); - fill_value = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_num, - kernel_volume, fill_value)); - return MLUOP_STATUS_SUCCESS; - } - sparse_conv_desc->num_act_out = num_act_out; - // call launchDefaultKernel2 gen step_index - void *step_index_addr = NULL; - step_index_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel2(handle, step_index_addr, num_act_out)); - - // call balance out_indices_expand_ptr distr - void *balance_input_addr = NULL, *balance_output_addr = NULL, - *balance_mask_addr = NULL; - balance_input_addr = out_indices_expand_ptr; - balance_output_addr = out_indices_expand_ptr; - balance_mask_addr = mask_all_ptr; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchBalanceKernel(handle, 
interface_name, balance_input_addr, - balance_output_addr, balance_mask_addr, - input_active_site, kernel_volume, output_size)); - - // call scatter_nd unique_output_addr + step_index_addr = grid_out_addr - void *scatter_input_addr = NULL, *scatter_output_addr = NULL, - *scatter_indice_addr = NULL; - scatter_input_addr = step_index_addr; - scatter_indice_addr = unique_output_addr; - scatter_output_addr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws); - int fill_value = -1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, scatter_output_addr, - output_size, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchScatterNdOp(handle, interface_name, scatter_output_addr, - scatter_input_addr, scatter_indice_addr, - output_size, num_act_out)); - - // call gather_nd out_indices_expand + grid_out_addr = indice_index_out - void *gather_input_addr = NULL, *gather_output_addr = NULL, - *gather_indice_addr = NULL; - gather_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - gather_input_addr = scatter_output_addr; - gather_indice_addr = out_indices_expand_ptr; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchGatherNdOp(handle, interface_name, gather_input_addr, - gather_output_addr, gather_indice_addr, - input_active_site, kernel_volume, output_size)); - - // call launchDefaultKernel3 l k partition and sort - void *kernel3_input_addr = NULL, *kernel3_output_addr = NULL, - *kernel3_mask_addr = NULL; - kernel3_input_addr = indice_index_in_ptr; - kernel3_output_addr = indice_pairs; - kernel3_mask_addr = mask_all_ptr; - fill_value = -1; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_pairs, - kernel_volume * 2 * input_active_site, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - 
launchDefaultKernel3(handle, kernel3_output_addr, - kernel3_input_addr, kernel3_mask_addr, - input_active_site, kernel_volume)); - - // get out_indices from indice unique - void *kernel4_output_addr = NULL, *kernel4_input_addr = NULL; - kernel4_input_addr = unique_output_addr; - kernel4_output_addr = out_indices; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel4(handle, sparse_conv_desc, kernel4_output_addr, - kernel4_input_addr, num_act_out)); - } - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t normalGetIndicePairs( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num, - const bool is_get_workspace, size_t *return_ws) { - if (is_get_workspace) { - return getNormalGetIndicePairsWorkspaceSize( - handle, interface_name, sparse_conv_desc, indices_desc, - indice_pairs_desc, out_indices_desc, indice_num_desc, return_ws); - } else { - return NormalGetIndicePairsKernel( - handle, interface_name, sparse_conv_desc, indices_desc, indices, - workspace, indice_pairs_desc, indice_pairs, out_indices_desc, - out_indices, indice_num_desc, indice_num); - } -} diff --git a/kernels/get_indice_pairs/normal_get_indice_pairs.h b/kernels/get_indice_pairs/normal_get_indice_pairs.h deleted file mode 100644 index a48a57b11..000000000 --- a/kernels/get_indice_pairs/normal_get_indice_pairs.h +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_GET_INDICE_PAIRS_NORMAL_GET_INDICE_PAIRS_H_ -#define KERNELS_GET_INDICE_PAIRS_NORMAL_GET_INDICE_PAIRS_H_ - -#include - -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *mask_all_ws, void *indice_index_in_ws, void *indice_out_expand_ws, - void *indices, FilterSpace filter_space, InputSpace input_space, - OutputSpace output_space, Stride stride, Dilation dilation, Padding padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch_size); - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *step_index_ptr, int32_t num_act_out, int32_t core_num_l); - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl3( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *indice_pairs, void *input_addr, void *mask_addr, - int32_t input_active_site, int32_t kernel_volume, int32_t core_num_l); - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl4( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *input_addr, OutputSpace host_output_space, - int32_t len_l, int32_t core_num_l); - -mluOpStatus_t MLUOP_WIN_API KernelBalanceGetIndicePair( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *balance_input, void *balance_mask, void *balance_output, - int32_t len_l, int32_t kernel_volume, int32_t core_num_l, - int32_t output_size); - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *mask_all_ptr, void *indice_index_in_ptr, void *indice_in_expand_ptr, - void *out_indices_expand_ptr, void *indices, FilterSpace host_filter_space, - InputSpace host_input_space, 
OutputSpace host_output_space, - Stride host_stride, Dilation host_dilation, Padding host_padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch_size); - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *mask_all_ptr, void *out_indices_expand_ptr, - void *indices, int32_t len_1_one, int32_t len_l_two, int32_t core_num_l_one, - int32_t core_num_l_two); - -mluOpStatus_t getNormalGetIndicePairsWorkspaceSize( - mluOpHandle_t handle, const std::string interface_name, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t out_indices_desc, - const mluOpTensorDescriptor_t indice_num_desc, size_t *return_ws); - -mluOpStatus_t normalGetIndicePairs( - mluOpHandle_t handle, const std::string interface_name, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num, - const bool is_get_workspace, size_t *return_ws); -#endif // KERNELS_GET_INDICE_PAIRS_NORMAL_GET_INDICE_PAIRS_H_ diff --git a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp b/kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp deleted file mode 100644 index ea0860f90..000000000 --- a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp +++ /dev/null @@ -1,904 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/indice_convolution_backward_data/indice_convolution_backward_data.h" - -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -static mluOpStatus_t foolCheckNoPtr( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, const int64_t indice_num[], - const int64_t inverse, const int64_t sub_m, - const mluOpTensorDescriptor_t input_grad_desc, bool *is_zero) { - std::string api = "[mluOpIndiceConvolutionBackwardData]"; - // check nullptr - PARAM_CHECK(api, handle != NULL); - PARAM_CHECK(api, output_grad_desc != NULL); - PARAM_CHECK(api, filters_desc != NULL); - PARAM_CHECK(api, indice_pairs_desc != NULL); - PARAM_CHECK(api, input_grad_desc != NULL); - - // check platform - if (handle->arch < 372) { - LOG(ERROR) << api << " Only support hardware over MLU300 ."; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // check dim - PARAM_CHECK_EQ(api, output_grad_desc->dim, 2); - PARAM_CHECK(api, filters_desc->dim == 4 || filters_desc->dim == 5); - PARAM_CHECK_EQ(api, indice_pairs_desc->dim, 3); - PARAM_CHECK_EQ(api, input_grad_desc->dim, 2); - - // check shape - PARAM_CHECK(api, indice_pairs_desc->dims[1] == 2); - if (indice_pairs_desc->dims[2] > INDICE_IN_LARGE_TENSOR_NUM) { - LOG(ERROR) << api << " Check failed: " - << "indice_pairs_desc->dims[2] cannot be greater than " - << INDICE_IN_LARGE_TENSOR_NUM << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // check dtype - PARAM_CHECK(api, output_grad_desc->dtype == MLUOP_DTYPE_FLOAT || - output_grad_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api, filters_desc->dtype == MLUOP_DTYPE_FLOAT || - filters_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api, 
input_grad_desc->dtype == MLUOP_DTYPE_FLOAT || - input_grad_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api, indice_pairs_desc->dtype == MLUOP_DTYPE_INT32); - - // check layout - bool layout_check = filters_desc->layout == MLUOP_LAYOUT_NHWC || - filters_desc->layout == MLUOP_LAYOUT_NCHW || - filters_desc->layout == MLUOP_LAYOUT_HWCN || - filters_desc->layout == MLUOP_LAYOUT_NCDHW || - filters_desc->layout == MLUOP_LAYOUT_NDHWC || - filters_desc->layout == MLUOP_LAYOUT_ARRAY; - if (!layout_check) { - LOG(ERROR) << api - << " The filters tensor only supports " - "NHWC/NCHW/HWCN/NCDHW/NDHWC/ARRAY layout."; - return MLUOP_STATUS_BAD_PARAM; - } - - // get filters params - int kd = 1, kh = 1, kw = 1, dyc = 1, dxc = 1; - if (filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - kh = mluOpGetTensordimH(filters_desc); - kw = mluOpGetTensordimW(filters_desc); - dyc = mluOpGetTensordimN(filters_desc); - dxc = mluOpGetTensordimC(filters_desc); - if (filters_desc->dim == 5) { - kd = mluOpGetTensordimD(filters_desc); - } - } else { - if (filters_desc->dim == 5) { - kd = filters_desc->dims[0]; - } - int _dim = filters_desc->dim; - kh = filters_desc->dims[_dim - 4]; - kw = filters_desc->dims[_dim - 3]; - dxc = filters_desc->dims[_dim - 2]; - dyc = filters_desc->dims[_dim - 1]; - } - int K = kd * kh * kw; - - // check param - PARAM_CHECK(api, inverse == 0 || inverse == 1); - PARAM_CHECK(api, sub_m == 0 || sub_m == 1); - for (int kk = 0; kk < K; ++kk) { - PARAM_CHECK(api, indice_num[kk] >= 0); - } - if (inverse == 1) { - LOG(ERROR) << api << " Not support inverse == 1 yet."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // check algorithm, relationship between params - if (K != indice_pairs_desc->dims[0]) { - LOG(ERROR) << api - << " The dims[0] of indice_pairs should be equal to the " - "multiple of kd, kh and kw."; - return MLUOP_STATUS_BAD_PARAM; - } - if (output_grad_desc->dims[1] != dyc) { - LOG(ERROR) << api - << " The dims[1] of output_grad should be equal to dyc of " - "filters 
tensor."; - return MLUOP_STATUS_BAD_PARAM; - } - if (input_grad_desc->dims[1] != dxc) { - LOG(ERROR) << api - << " The dims[1] of input_grad should be equal to dxc of " - "filters tensor."; - return MLUOP_STATUS_BAD_PARAM; - } - if (input_grad_desc->dims[0] != indice_pairs_desc->dims[2]) { - LOG(ERROR) << api - << " The dims[0] of input_grad should be equal to the dims[2] " - "of indice_pairs."; - return MLUOP_STATUS_BAD_PARAM; - } - int max_indice_num = getMaxNumInArray(indice_num, K); - - if (indice_pairs_desc->dims[2] < max_indice_num) { - VLOG(5) << "indice_pairs_desc->dims[2] " << indice_pairs_desc->dims[2] - << " max_indice_num " << max_indice_num; - LOG(ERROR) << api - << " The data in indice_num array should be smaller or equal to" - << " the dims[2] of indice_pairs."; - return MLUOP_STATUS_BAD_PARAM; - } - if (sub_m == 1) { - if (input_grad_desc->dims[0] != output_grad_desc->dims[0]) { - LOG(ERROR) << api - << " The dims[0] of input_grad should be equal to the dims[0]" - << " of output_grad when sub_m is 1."; - return MLUOP_STATUS_BAD_PARAM; - } - - if (indice_num[K / 2] < max_indice_num) { - LOG(ERROR) << api - << " The middle number of the indice_num array should be the " - << "maximum of the array when sub_m is 1. 
Now the maximum is " - << max_indice_num << " while the middle number of the array " - << "is " << indice_num[K / 2] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - - if (output_grad_desc->dims[0] < max_indice_num) { - LOG(ERROR) - << api - << " The dims[0] of output_grad should be larger than or equal to the" - << " maximum number of indice_num."; - return MLUOP_STATUS_BAD_PARAM; - } - - if (sub_m == 1 && K % 2 == 0) { - LOG(ERROR) << api << " When sub_m value is 1, the filters dims (Kd, Kh & " - << "Kw) should be odd numbers."; - return MLUOP_STATUS_BAD_PARAM; - } - - PARAM_CHECK(api, output_grad_desc->dtype == input_grad_desc->dtype); - PARAM_CHECK(api, output_grad_desc->dtype == filters_desc->dtype); - - // check constraints: not support large tensor - uint64_t input_grad_count = mluOpGetTensorElementNum(input_grad_desc); - TENSOR_NUM_CHECK(api, input_grad_count, LARGE_TENSOR_NUM, - "input_grad tensor num is too large. "); - uint64_t output_grad_count = mluOpGetTensorElementNum(output_grad_desc); - TENSOR_NUM_CHECK(api, output_grad_count, LARGE_TENSOR_NUM, - "output_grad tensor num is too large. "); - uint64_t filter_count = mluOpGetTensorElementNum(filters_desc); - TENSOR_NUM_CHECK(api, filter_count, LARGE_TENSOR_NUM, - "filters tensor num is too large. "); - uint64_t indice_pairs_count = mluOpGetTensorElementNum(indice_pairs_desc); - TENSOR_NUM_CHECK(api, indice_pairs_count, LARGE_TENSOR_NUM, - "indice_pairs tensor num is too large. 
"); - - // check zero element - if (input_grad_count == 0) { - LOG(INFO) << "input_grad is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - if (output_grad_count == 0) { - LOG(INFO) << "output_grad is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - if (filter_count == 0) { - LOG(INFO) << "filters is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - if (indice_pairs_count == 0) { - LOG(INFO) << "indice_pairs is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - return MLUOP_STATUS_SUCCESS; -} - -static void getPermuteArray(const mluOpTensorLayout_t filter_layout, - int *permute) { - // transpose to (D)HWCN, (kd-)kh-kw-dxc-dyc - switch (filter_layout) { - case MLUOP_LAYOUT_NHWC: { - permute[0] = 1; - permute[1] = 2; - permute[2] = 3; - permute[3] = 0; - }; break; - case MLUOP_LAYOUT_NCHW: { - permute[0] = 2; - permute[1] = 3; - permute[2] = 1; - permute[3] = 0; - }; break; - case MLUOP_LAYOUT_NDHWC: { - permute[0] = 1; - permute[1] = 2; - permute[2] = 3; - permute[3] = 4; - permute[4] = 0; - }; break; - case MLUOP_LAYOUT_NCDHW: { - permute[0] = 2; - permute[1] = 3; - permute[2] = 4; - permute[3] = 1; - permute[4] = 0; - }; break; - case MLUOP_LAYOUT_HWCN: - default: - break; - } -} - -static mluOpStatus_t foolCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t sub_m, void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t input_grad_desc, void *input_grad, - bool *is_zero) { - std::string api = "[mluOpIndiceConvolutionBackwardData]"; - mluOpStatus_t ret = - foolCheckNoPtr(handle, output_grad_desc, filters_desc, indice_pairs_desc, - indice_num, inverse, sub_m, 
input_grad_desc, is_zero); - if (ret != MLUOP_STATUS_SUCCESS) { - return ret; - } - if (*is_zero) { - return MLUOP_STATUS_SUCCESS; - } - - // check workspace & other space - PARAM_CHECK(api, output_grad != NULL); - PARAM_CHECK(api, filters != NULL); - PARAM_CHECK(api, indice_pairs != NULL); - PARAM_CHECK(api, input_grad != NULL); - if (workspace_size > 0) { - PARAM_CHECK(api, workspace != NULL); - } - return MLUOP_STATUS_SUCCESS; -} - -static void spconvbpdataGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t sub_m, void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t input_grad_desc, void *input_grad) { - GEN_CASE_START("indice_convolution_backward_data"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "output_grad", output_grad, output_grad_desc); - GEN_CASE_DATA_REAL(true, "filters", filters, filters_desc); - GEN_CASE_DATA_REAL(true, "indice_pairs_desc", indice_pairs, - indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "input_grad", input_grad, input_grad_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "indice_convolution_backward_data", "inverse", - inverse); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_backward_data", "sub_m", - sub_m); - GEN_CASE_OP_PARAM_ARRAY(1, "indice_convolution_backward_data", "indice_num", - indice_num, indice_pairs_desc->dims[0]); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -/* - * [output_grad] [filters] - * | | - * | cnnlGatherNd() | cnnlTranspose_v2() - * | | - * V V - * [output_grad_condence] [filter_transpose] - * |_______________________| - * | - * | cnnlMatMul_v2() - * | - * V - * [input_grad_condence] - * | - * | cnnlScatterNd_v2(CNNL_SCATTERND_UPDATE) - * | - * V - * [workspace_input_grad_tmp] - 
* | - * | cnnlAddN_v2() - * | - * V - * [input_grad] - */ -mluOpStatus_t MLUOP_WIN_API mluOpGetIndiceConvolutionBackwardDataWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t input_grad_desc, const int64_t indice_num[], - const int64_t inverse, size_t *workspace_size) { - bool is_zero_element = false; - if (workspace_size == NULL) { - LOG(ERROR) << "[mluOpGetIndiceConvolutionBackwardDataWorkspaceSize] " - << "The pointer workspace_size should not be nullptr."; - return MLUOP_STATUS_BAD_PARAM; - } - mluOpStatus_t ret = - foolCheckNoPtr(handle, output_grad_desc, filters_desc, indice_pairs_desc, - indice_num, inverse, 0, input_grad_desc, &is_zero_element); - if (ret != MLUOP_STATUS_SUCCESS) { - return ret; - } - if (is_zero_element) { - return MLUOP_STATUS_SUCCESS; - } - - int kd = 1, kh = 1, kw = 1, dyc = 1, dxc = 1; - if (filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - kh = mluOpGetTensordimH(filters_desc); - kw = mluOpGetTensordimW(filters_desc); - dyc = mluOpGetTensordimN(filters_desc); - dxc = mluOpGetTensordimC(filters_desc); - if (filters_desc->dim == 5) { - kd = mluOpGetTensordimD(filters_desc); - } - } else { - if (filters_desc->dim == 5) { - kd = filters_desc->dims[0]; - } - int _dim = filters_desc->dim; - kh = filters_desc->dims[_dim - 4]; - kw = filters_desc->dims[_dim - 3]; - dxc = filters_desc->dims[_dim - 2]; - dyc = filters_desc->dims[_dim - 1]; - } - int K = kd * kh * kw; - int max_indice_num = getMaxNumInArray(indice_num, K); - uint64_t filter_transpose_size = 0; - uint64_t transpose_workspace_size = 0; - uint64_t output_grad_condence_size = 0; - uint64_t input_grad_condence_size = 0; - uint64_t matmul_workspace_size = 0; - if (!(filters_desc->layout == MLUOP_LAYOUT_HWCN || - filters_desc->layout == MLUOP_LAYOUT_ARRAY)) { - filter_transpose_size = mluOpGetTensorElementNum(filters_desc) * 
- mluOpDataTypeBytes(filters_desc->dtype); - // get cnnlTranspose_v2 workspace workspace_size - size_t transpose_workspace_size_ = 0; - cnnlTransposeDescriptor_t trans_desc; - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - int permute[5] = {0, 1, 2, 3, 4}; - getPermuteArray(filters_desc->layout, permute); - CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, filters_desc->dim, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size_)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - transpose_workspace_size = (uint64_t)transpose_workspace_size_; - } - output_grad_condence_size = max_indice_num * output_grad_desc->dims[1] * - mluOpDataTypeBytes(filters_desc->dtype); - input_grad_condence_size = max_indice_num * input_grad_desc->dims[1] * - mluOpDataTypeBytes(filters_desc->dtype); - - // matmul workspace - { - mluOpTensorDescriptor_t sub_filters_desc; - mluOpTensorDescriptor_t output_grad_condence_desc; - mluOpTensorDescriptor_t input_grad_condence_desc; - - cnnlMatMulDescriptor_t cnnl_matmul_desc; - cnnlMatMulHeuristicResult_t cnnl_heuristic_result; - cnnlMatMulAlgo_t cnnl_matmul_algo; - - MLUOP_CHECK(mluOpCreateTensorDescriptor(&sub_filters_desc)); - // MLUOP_CHECK(mluOpCreateTensorDescriptor(&sub_filters_desc)); - int sub_filter_dims[2] = {(int)(dxc), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor(sub_filters_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 2, - sub_filter_dims)); - int is_trans_a = 0, is_trans_b = 1; - int tf32_flag_int = 0; - CALL_CNNL(cnnlMatMulDescCreate(&cnnl_matmul_desc)); - CALL_CNNL(cnnlSetMatMulDescAttr(cnnl_matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &(is_trans_a), sizeof(is_trans_a))); - 
CALL_CNNL(cnnlSetMatMulDescAttr(cnnl_matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &(is_trans_b), sizeof(is_trans_b))); - CALL_CNNL(cnnlSetMatMulDescAttr(cnnl_matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &(tf32_flag_int), sizeof(tf32_flag_int))); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_grad_condence_desc)); - int output_grad_condence_dims[2] = {(int)(max_indice_num), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - output_grad_condence_desc, MLUOP_LAYOUT_ARRAY, output_grad_desc->dtype, - 2, output_grad_condence_dims)); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&input_grad_condence_desc)); - int input_grad_condence_dims[2] = {(int)(max_indice_num), (int)(dxc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - input_grad_condence_desc, MLUOP_LAYOUT_ARRAY, input_grad_desc->dtype, 2, - input_grad_condence_dims)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sub_filters_desc, - cnnl_sub_filters_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR( - output_grad_condence_desc, cnnl_output_grad_condence_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_input_grad_condence_desc); - - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&cnnl_heuristic_result)); - CALL_CNNL(cnnlMatMulAlgoCreate(&cnnl_matmul_algo)); - - // set matmul heuristic_result & algorithm - int requested_algo_count = 1, return_algo_count = 0; - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, cnnl_matmul_desc, cnnl_output_grad_condence_desc, - cnnl_sub_filters_desc, cnnl_input_grad_condence_desc, - cnnl_input_grad_condence_desc, NULL, requested_algo_count, - &cnnl_heuristic_result, &return_algo_count)); - - // launch matmul - size_t workspace_size_matmul = 0; - float alpha_gemm = 1.0f, beta_gemm = 0.0f; - CALL_CNNL(cnnlGetMatMulHeuristicResult( - cnnl_heuristic_result, cnnl_matmul_algo, &workspace_size_matmul)); - - // destroy descriptors - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(cnnl_heuristic_result)); - 
CALL_CNNL(cnnlMatMulDescDestroy(cnnl_matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(cnnl_matmul_algo)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_grad_condence_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_sub_filters_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_grad_condence_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - MLUOP_CHECK(mluOpDestroyTensorDescriptor(output_grad_condence_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(sub_filters_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(input_grad_condence_desc)); - matmul_workspace_size = (uint64_t)workspace_size_matmul; - } - // scatter to input_grad_tmp_workspace_size workspace - uint64_t input_grad_tmp_workspace_size = - mluOpGetTensorElementNum(input_grad_desc) * - mluOpDataTypeBytes(input_grad_desc->dtype); - - // addn workspace - uint32_t addn_num = 2; - size_t addn_workspace_size = 0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = (cnnlTensorDescriptor_t *)malloc( - sizeof(cnnlTensorDescriptor_t) * addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlGetAddNWorkspaceSize(cnnl_handle, cnnl_input_descs, addn_num, - cnnl_output_desc, &addn_workspace_size), - CNNL_STATUS_SUCCESS, - "[cnnlAddN_v2] Internal error accured in cnnlGetAddNWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - *workspace_size = - (size_t)(filter_transpose_size + transpose_workspace_size + - output_grad_condence_size + input_grad_condence_size + - matmul_workspace_size + input_grad_tmp_workspace_size + - addn_workspace_size); - VLOG(5) << 
"[mluOpIndiceConvolutionBackwardData] filter_transpose_size=" - << filter_transpose_size - << ", transpose_workspace_size=" << transpose_workspace_size - << ", output_grad_condence_size=" << output_grad_condence_size - << ", input_grad_condence_size=" << input_grad_condence_size - << ", matmul_workspace_size=" << matmul_workspace_size - << ", input_grad_tmp_workspace_size=" << input_grad_tmp_workspace_size - << ", addn_workspace_size=" << addn_workspace_size; - VLOG(5) << "[mluOpIndiceConvolutionBackwardData] workspace workspace_size: " - << *workspace_size; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t sub_m, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t input_grad_desc, void *input_grad) { - // fool check - { - bool is_zero_element = false; - mluOpStatus_t ret = foolCheck( - handle, output_grad_desc, output_grad, filters_desc, filters, - indice_pairs_desc, indice_pairs, indice_num, inverse, sub_m, workspace, - workspace_size, input_grad_desc, input_grad, &is_zero_element); - if (ret != MLUOP_STATUS_SUCCESS) { - return ret; - } - if (is_zero_element) { - return MLUOP_STATUS_SUCCESS; - } - } - - // gen_case - if (MLUOP_GEN_CASE_ON_NEW) { - spconvbpdataGencase(handle, output_grad_desc, output_grad, filters_desc, - filters, indice_pairs_desc, indice_pairs, indice_num, - inverse, sub_m, workspace, workspace_size, - input_grad_desc, input_grad); - } - - // get filters params - int kd = 1, kh = 1, kw = 1, dyc = 1, dxc = 1; - if (filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - kh = mluOpGetTensordimH(filters_desc); - kw = mluOpGetTensordimW(filters_desc); - dyc = 
mluOpGetTensordimN(filters_desc); - dxc = mluOpGetTensordimC(filters_desc); - if (filters_desc->dim == 5) { - kd = mluOpGetTensordimD(filters_desc); - } - } else { - if (filters_desc->dim == 5) { - kd = filters_desc->dims[0]; - } - int _dim = filters_desc->dim; - kh = filters_desc->dims[_dim - 4]; - kw = filters_desc->dims[_dim - 3]; - dxc = filters_desc->dims[_dim - 2]; - dyc = filters_desc->dims[_dim - 1]; - } - int K = kd * kh * kw; - int cal_dwidth = mluOpDataTypeBytes(filters_desc->dtype); - uint64_t filter_transpose_size = 0, output_grad_condence_size = 0, - input_grad_condence_size = 0; - if (!(filters_desc->layout == MLUOP_LAYOUT_HWCN)) { - filter_transpose_size = mluOpGetTensorElementNum(filters_desc) * cal_dwidth; - VLOG(5) << "host invoke: filter_transpose_size " << filter_transpose_size; - } - output_grad_condence_size = - getMaxNumInArray(indice_num, K) * output_grad_desc->dims[1] * cal_dwidth; - input_grad_condence_size = - getMaxNumInArray(indice_num, K) * input_grad_desc->dims[1] * cal_dwidth; - char *filter_transpose = (char *)filters; - char *workspace_base = (char *)workspace; - - // transpose filters to layout XHWCN - mluOpTensorDescriptor_t filter_transpose_desc; - if (filters_desc->layout != MLUOP_LAYOUT_HWCN && - filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - filter_transpose = (char *)workspace; - workspace_base += filter_transpose_size; - cnnlTransposeDescriptor_t trans_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&filter_transpose_desc)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - int permute[5] = {0, 1, 2, 3, 4}; - int filter_transpose_dims[5]; - getPermuteArray(filters_desc->layout, permute); - for (int i = 0; i < filters_desc->dim; ++i) { - filter_transpose_dims[i] = filters_desc->dims[permute[i]]; - VLOG(5) << "permute " << permute[i]; - } - MLUOP_CHECK(mluOpSetTensorDescriptor( - filter_transpose_desc, MLUOP_LAYOUT_ARRAY, filters_desc->dtype, - filters_desc->dim, filter_transpose_dims)); - CALL_CNNL( - 
cnnlSetTransposeDescriptor(trans_desc, filters_desc->dim, permute)); - size_t transpose_workspace_size = 0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - char *transpose_workspace = workspace_base; - workspace_base += transpose_workspace_size; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_transpose_desc, - cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2( - cnnl_handle, trans_desc, cnnl_x_desc, filters, cnnl_y_desc, - filter_transpose, transpose_workspace, transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(filter_transpose_desc)); - } else { - filter_transpose_desc = filters_desc; - } - char *output_grad_condence = workspace_base; - workspace_base += output_grad_condence_size; - char *input_grad_condence = workspace_base; - workspace_base += input_grad_condence_size; - - // filters calculate desc - mluOpTensorDescriptor_t sub_filters_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&sub_filters_desc)); - int sub_filter_dims[2] = {(int)(dxc), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor(sub_filters_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 2, - sub_filter_dims)); - float fill_value = 0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, 
input_grad)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - void *workspace_matmul = NULL; - char *workspace_input_grad_tmp = NULL; - char *workspace_addn = NULL; - - // filters DHW dim loop - int kk_count = 0; - for (size_t kk = 0; kk < K; ++kk) { - VLOG(5) << "indice_num " << indice_num[kk]; - if (indice_num[kk] == 0) { - continue; - } - const int int_dwidth = 4; - char *sub_filter = filter_transpose + kk * dyc * dxc * cal_dwidth; - - // gather output_grad - mluOpTensorDescriptor_t gather_indices_desc; - mluOpTensorDescriptor_t output_grad_condence_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&gather_indices_desc)); - int gather_indices_dims[2] = {(int)(indice_num[kk]), (int)(1)}; - MLUOP_CHECK(mluOpSetTensorDescriptor(gather_indices_desc, - MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - 2, gather_indices_dims)); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_grad_condence_desc)); - int output_grad_condence_dims[2] = {(int)(indice_num[kk]), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - output_grad_condence_desc, MLUOP_LAYOUT_ARRAY, output_grad_desc->dtype, - 2, output_grad_condence_dims)); - uint64_t gather_indices_offset = - (kk * 2 + 1) * int(indice_pairs_desc->dims[2]) * int_dwidth; - char *gather_indices = - (char *)(const_cast(indice_pairs)) + gather_indices_offset; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indices_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_condence_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, output_grad, - cnnl_indices_desc, gather_indices, - cnnl_output_desc, output_grad_condence)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - 
DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // matmul - cnnlMatMulDescriptor_t matmul_desc; - int is_trans_a = 0, is_trans_b = 1; - int tf32_flag_int = 0; - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &(is_trans_a), sizeof(is_trans_a))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &(is_trans_b), sizeof(is_trans_b))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &(tf32_flag_int), sizeof(tf32_flag_int))); - mluOpTensorDescriptor_t input_grad_condence_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&input_grad_condence_desc)); - int input_grad_condence_dims[2] = {(int)(indice_num[kk]), (int)(dxc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - input_grad_condence_desc, MLUOP_LAYOUT_ARRAY, input_grad_desc->dtype, 2, - input_grad_condence_dims)); - cnnlMatMulHeuristicResult_t heuristic_result; - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); - cnnlMatMulAlgo_t matmul_algo; - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - - // set matmul heuristic_result & algorithm - int requested_algo_count = 1, return_algo_count = 0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_condence_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sub_filters_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_d_desc); - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, NULL, requested_algo_count, &heuristic_result, - &return_algo_count)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - 
DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // launch matmul - size_t workspace_size_matmul = 0; - float alpha_gemm = 1.0f, beta_gemm = 0.0f; - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &workspace_size_matmul)); - if (kk_count == 0) { - workspace_matmul = workspace_size_matmul == 0 - ? NULL - : reinterpret_cast(workspace_base); - workspace_base += workspace_size_matmul; - } - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_condence_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sub_filters_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_d_desc); - CALL_CNNL(cnnlMatMul_v2( - cnnl_handle, matmul_desc, matmul_algo, &alpha_gemm, cnnl_a_desc, - output_grad_condence, cnnl_b_desc, sub_filter, &beta_gemm, - cnnl_c_desc, input_grad_condence, workspace_matmul, - workspace_size_matmul, cnnl_d_desc, input_grad_condence)); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // destroy descriptors - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(heuristic_result)); - CALL_CNNL(cnnlMatMulDescDestroy(matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(matmul_algo)); - - // fill workspace_input_grad_tmp - uint64_t input_grad_tmp_workspace_size = - mluOpGetTensorElementNum(input_grad_desc) * - mluOpDataTypeBytes(input_grad_desc->dtype); - if (kk_count == 0) { - workspace_input_grad_tmp = workspace_base; - workspace_base += input_grad_tmp_workspace_size; - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, 
CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, workspace_input_grad_tmp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - // scatter input_grad - uint64_t scatter_indices_offset = - (kk * 2) * int(indice_pairs_desc->dims[2]) * int_dwidth; - char *scatter_indices = - (char *)(const_cast(indice_pairs)) + scatter_indices_offset; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indices_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2(cnnl_handle, CNNL_SCATTERND_UPDATE, - cnnl_indices_desc, scatter_indices, - cnnl_updates_desc, input_grad_condence, - cnnl_input_desc, workspace_input_grad_tmp, - cnnl_output_desc, workspace_input_grad_tmp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // add workspace_input_grad_tmp tensor back to input_grad - if (kk_count == 0) { - workspace_addn = workspace_base; - } - void *addn_array[2] = {reinterpret_cast(workspace_input_grad_tmp), - input_grad}; - size_t addn_workspace_size = 0; - uint32_t addn_num = 2; - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = - (cnnlTensorDescriptor_t *)malloc(sizeof(cnnlTensorDescriptor_t) * - addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CHECK_FUNC_RETURN( - 
cnnlGetAddNWorkspaceSize(cnnl_handle, cnnl_input_descs, addn_num, - cnnl_output_desc, &addn_workspace_size), - CNNL_STATUS_SUCCESS, - "[cnnlAddN_v2] Internal error accured in cnnlGetAddNWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - - CALL_CNNL(cnnlAddN_v2(cnnl_handle, cnnl_input_descs, addn_array, addn_num, - cnnl_output_desc, input_grad, workspace_addn, - addn_workspace_size)); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - MLUOP_CHECK(mluOpDestroyTensorDescriptor(input_grad_condence_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(gather_indices_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(output_grad_condence_desc)); - kk_count++; - } - MLUOP_CHECK(mluOpDestroyTensorDescriptor(sub_filters_desc)); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h b/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h deleted file mode 100644 index e8be23270..000000000 --- a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h +++ /dev/null @@ -1,36 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS self.tcp LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *******************************************************************************/ -#ifndef KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT -#define KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT - -#include "core/tensor.h" - -inline int getMaxNumInArray(const int64_t arr[], const int num) { - int max_num = (int)(arr[0]); - for (int i = 1; i < num; ++i) { - max_num = max_num > (int)(arr[i]) ? 
max_num : (int)(arr[i]); - } - return max_num; -} - -#endif // KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT diff --git a/kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp b/kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp deleted file mode 100644 index ee33eb38d..000000000 --- a/kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp +++ /dev/null @@ -1,605 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/tensor.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -inline bool isFloatDtype(const mluOpDataType_t &dtype) { - return (dtype == MLUOP_DTYPE_HALF || dtype == MLUOP_DTYPE_FLOAT); -} - -inline mluOpDataType_t getOnchipDataType( - const mluOpTensorDescriptor_t tensor_desc) { - if (tensor_desc->onchip_dtype != MLUOP_DTYPE_INVALID) { - return tensor_desc->onchip_dtype; - } else { - return tensor_desc->dtype; - } -} - -inline mluOpStatus_t setMatmulDescInfo(const std::string api_name, - cnnlMatMulDescriptor_t matmul_desc, - const uint32_t is_trans_a_value, - const uint32_t is_trans_b_value, - const uint32_t compute_dtype, - const uint32_t allow_tf32) { - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &is_trans_a_value, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &is_trans_b_value, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, - &compute_dtype, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &(allow_tf32), sizeof(int32_t))); - return MLUOP_STATUS_SUCCESS; -} - -inline std::string getTensorShapeString(const mluOpTensorDescriptor_t desc) { - std::string res; - res.push_back('['); - for (int32_t i = 0; i < desc->dim - 1; i++) { - res.append(std::to_string(desc->dims[i]) + ','); - } - res.append(std::to_string(desc->dims[desc->dim - 1]) + ']'); - return res; -} - -static void indiceConvFilterGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const void *features, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t 
indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t subm, void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t filters_grad_desc, void *filters_grad) { - GEN_CASE_START("indice_convolution_backward_filter"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "features", features, features_desc); - GEN_CASE_DATA_REAL(true, "output_grad", output_grad, output_grad_desc); - GEN_CASE_DATA_REAL(true, "indice_pairs_desc", indice_pairs, - indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "diff_w", filters_grad, filters_grad_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "indice_convolution_backward", "inverse", - inverse); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_backward", "subm", subm); - GEN_CASE_OP_PARAM_ARRAY(1, "indice_convolution_backward", "indice_num", - indice_num, indice_pairs_desc->dims[0]); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -// check input and diffy -static mluOpStatus_t indiceConvDtypeVaild( - const std::string api_name, const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t filters_grad_desc) { - auto input_dtype = features_desc->dtype; - auto diffy_dtype = output_grad_desc->dtype; - auto filters_grad_dtype = filters_grad_desc->dtype; - auto pairs_dtype = indice_pairs_desc->dtype; - if (pairs_dtype != MLUOP_DTYPE_INT32) { - LOG(ERROR) << api_name - << " indice_pairs_desc only supports data type int32. 
" - << "But now the data type is " - << mluOpGetNameOfDataType(pairs_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - if (input_dtype != diffy_dtype || input_dtype != filters_grad_dtype || - !isFloatDtype(input_dtype) || !isFloatDtype(diffy_dtype) || - !isFloatDtype(filters_grad_dtype)) { - LOG(ERROR) - << api_name << " The data type of features_desc, output_grad_desc " - << "and filters_grad_desc should be the same and the three should " - << "be either half or float. But now the data types are " - << mluOpGetNameOfDataType(input_dtype) << "-" - << mluOpGetNameOfDataType(diffy_dtype) << "-" - << mluOpGetNameOfDataType(filters_grad_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - auto input_on_dtype = features_desc->onchip_dtype; - auto diffy_on_dtype = output_grad_desc->onchip_dtype; - auto filters_grad_on_dtype = filters_grad_desc->onchip_dtype; - auto pairs_on_dtype = indice_pairs_desc->onchip_dtype; - if ((MLUOP_DTYPE_INVALID != input_on_dtype && - input_on_dtype != input_dtype) || - (MLUOP_DTYPE_INVALID != diffy_on_dtype && - diffy_on_dtype != diffy_dtype) || - (MLUOP_DTYPE_INVALID != pairs_on_dtype && - pairs_on_dtype != pairs_dtype)) { - LOG(ERROR) << api_name - << " For features_desc, output_grad_desc and indice_pairs_desc, " - << "there is no need to set the on-chip data type, and if so, " - << "it needs to be the same as their off-chip data type. 
" - << "But now two data types of features_desc are " - << mluOpGetNameOfDataType(input_dtype) << "-" - << mluOpGetNameOfDataType(input_on_dtype) - << ", output_grad_desc are " - << mluOpGetNameOfDataType(diffy_dtype) << "-" - << mluOpGetNameOfDataType(diffy_on_dtype) - << ", and indice_pairs_desc are " - << mluOpGetNameOfDataType(pairs_dtype) << "-" - << mluOpGetNameOfDataType(pairs_on_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - if ((filters_grad_on_dtype != MLUOP_DTYPE_INVALID && - !isFloatDtype(filters_grad_on_dtype)) || - (filters_grad_dtype == MLUOP_DTYPE_FLOAT && - filters_grad_on_dtype == MLUOP_DTYPE_HALF)) { - LOG(ERROR) << api_name << " The on-chip data type of filters_grad_desc " - << "may not be set, if it is set, only half or float types are " - << "supported, and the bit width of on-chip data type can not " - << "be smaller than that of off-chip data type. But now two " - << "data types of filters_grad_desc are " - << mluOpGetNameOfDataType(filters_grad_dtype) << "-" - << mluOpGetNameOfDataType(filters_grad_on_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t baseParamCheck( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t filters_grad_desc, const int64_t indice_num[], - const int64_t inverse) { - PARAM_CHECK(api_name, handle != nullptr); - PARAM_CHECK(api_name, features_desc != nullptr); - PARAM_CHECK(api_name, output_grad_desc != nullptr); - PARAM_CHECK(api_name, indice_pairs_desc != nullptr); - PARAM_CHECK(api_name, filters_grad_desc != nullptr); - PARAM_CHECK(api_name, indice_num != nullptr); - PARAM_CHECK(api_name, inverse == 0); - - // check mlu platform - if (handle->arch < 372) { - LOG(ERROR) << api_name << " Only mlu300 and above devices are supported." 
- << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // check data type - auto dtype_check = - indiceConvDtypeVaild(api_name, features_desc, output_grad_desc, - indice_pairs_desc, filters_grad_desc); - if (MLUOP_STATUS_SUCCESS != dtype_check) { - return dtype_check; - } - - if (mluOpGetTensorElementNum(features_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(output_grad_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(indice_pairs_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(filters_grad_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << api_name << " Overflow max tensor num." - << " Currently, MLU-OPS supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - bool shape_check = true; - if (2 != features_desc->dim || 2 != output_grad_desc->dim || - 3 != indice_pairs_desc->dim || - (4 != filters_grad_desc->dim && 5 != filters_grad_desc->dim)) { - shape_check = false; // dimension check failed! - } - - // only DHWCN/HWCN layout of filter_grad is supported, currently - int32_t filter_dim_len = filters_grad_desc->dim; - auto ci = filters_grad_desc->dims[filter_dim_len - 2]; - auto co = filters_grad_desc->dims[filter_dim_len - 1]; - auto kd = filter_dim_len == 4 ? 1 : filters_grad_desc->dims[0]; - auto kh = filter_dim_len == 4 ? filters_grad_desc->dims[0] - : filters_grad_desc->dims[1]; - auto kw = filter_dim_len == 4 ? filters_grad_desc->dims[1] - : filters_grad_desc->dims[2]; - if (ci != features_desc->dims[1] || co != output_grad_desc->dims[1] || - features_desc->dims[0] != indice_pairs_desc->dims[2] || - 2 != indice_pairs_desc->dims[1] || - kd * kh * kw != indice_pairs_desc->dims[0]) { - shape_check = false; // interdependent dimension check failed! - } - PARAM_CHECK_LE(api_name, indice_pairs_desc->dims[2], - INDICE_IN_LARGE_TENSOR_NUM); - - if (!shape_check) { - LOG(ERROR) << api_name << " Shape check failed! 
" - << "Now the shapes are features_desc" - << getTensorShapeString(features_desc) << ", output_grad_desc" - << getTensorShapeString(output_grad_desc) - << ", indice_pairs_desc" - << getTensorShapeString(indice_pairs_desc) - << ", and filters_grad_desc" - << getTensorShapeString(filters_grad_desc) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t insertTranspose( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t filters_grad_desc, - const void *filters_grad_temp, void *filters_grad_buffer, void *workspace, - size_t *size, const bool is_get_workspace, const int32_t kernel_volume, - const int32_t ci, const int32_t co) { - int32_t trans_in_shape[3] = {kernel_volume, ci, co}; - int32_t trans_out_shape[3] = {co, kernel_volume, ci}; // NHWC or NDHWC - int32_t permute[3] = {2, 0, 1}; - if (MLUOP_LAYOUT_NCHW == filters_grad_desc->layout || - MLUOP_LAYOUT_NCDHW == filters_grad_desc->layout) { - trans_out_shape[0] = co; - trans_out_shape[1] = ci; - trans_out_shape[2] = kernel_volume; - permute[0] = 2; - permute[1] = 1; - permute[2] = 0; - } - - size_t transpose_workspace = 0; - mluOpTensorDescriptor_t trans_in_desc, trans_out_desc; - cnnlTransposeDescriptor_t trans_desc; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_in_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_out_desc)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_in_desc, MLUOP_LAYOUT_ARRAY, - filters_grad_desc->dtype, 3, trans_in_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_out_desc, MLUOP_LAYOUT_ARRAY, - filters_grad_desc->dtype, 3, trans_out_shape)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, 3, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - 
CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_x_desc, - trans_desc, &transpose_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (is_get_workspace) { // is get workspace - *size = transpose_workspace; - } else { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_out_desc, cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2( - cnnl_handle, trans_desc, cnnl_x_desc, filters_grad_temp, cnnl_y_desc, - filters_grad_buffer, workspace, transpose_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_in_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_out_desc)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// called by getWorkspace and compute api -// workspace_size is not nullptr when it's from getWorkspace api. -static mluOpStatus_t internalIndiceConvBackwardFilter( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, const void *features, - const mluOpTensorDescriptor_t output_grad_desc, const void *output_grad, - const mluOpTensorDescriptor_t indice_pairs_desc, const void *indice_pairs, - const int64_t indice_num[], void *workspace, size_t *workspace_size, - const mluOpTensorDescriptor_t filters_grad_desc, void *filters_grad) { - bool is_get_workspace = workspace_size != nullptr ? true : false; - bool filters_grad_need_trans = false; - - // call gather_nd and matmul to finish indice conv. 
- int32_t kernel_volume = indice_pairs_desc->dims[0]; - int32_t ci = features_desc->dims[1]; - int32_t co = output_grad_desc->dims[1]; - int32_t max_active_num = 0; - for (int32_t i = 0; i < kernel_volume; ++i) { - max_active_num = - indice_num[i] > max_active_num ? indice_num[i] : max_active_num; - } - - int64_t max_input_size = - max_active_num * ci * mluop::getSizeOfDataType(features_desc->dtype); - int64_t max_diffy_size = - max_active_num * co * mluop::getSizeOfDataType(features_desc->dtype); - int64_t filters_grad_trans_size = - filters_grad_need_trans ? filters_grad_desc->total_tensor_size : 0; - - void *filters_grad_temp = filters_grad_need_trans ? workspace : filters_grad; - void *input_temp = (char *)workspace + filters_grad_trans_size; - void *diffy_temp = (char *)input_temp + max_input_size; - void *matmul_ws = (char *)diffy_temp + max_diffy_size; - - // create temp tensor for gather and matmul - mluOpTensorDescriptor_t active_indice_desc; - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - cnnlMatMulDescriptor_t matmul_desc; - cnnlMatMulAlgo_t matmul_algo; - cnnlMatMulHeuristicResult_t heuristic_result; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&active_indice_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); - CHECK_RETURN( - api_name, - setMatmulDescInfo(api_name, matmul_desc, 1, 0, - (uint32_t)getOnchipDataType(filters_grad_desc), 0)); - int32_t requested_algo_count = 1, return_algo_count = 0; - float alpha = 1.0, beta = 0.0, fill_value = 0; - size_t matmul_ws_size = 0, temp_matmul_size = 0; - - // filters_grad fill for unused kernel - if (!is_get_workspace) { - 
DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_grad_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, filters_grad_temp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - int64_t in_active_num = indice_pairs_desc->dims[2]; - int64_t cico_size = - ci * co * mluop::getSizeOfDataType(filters_grad_desc->dtype); - int64_t pair_low_size = - in_active_num * mluop::getSizeOfDataType(indice_pairs_desc->dtype); - - for (int32_t i = 0; i < kernel_volume; ++i) { - int32_t active_point_num = indice_num[i]; - if (active_point_num <= 0) { - continue; - } - - int32_t active_indices[2] = {active_point_num, 1}; - int32_t a_desc_dims[2] = {active_point_num, ci}; - int32_t b_desc_dims[2] = {active_point_num, co}; - int32_t c_desc_dims[2] = {ci, co}; - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - active_indice_desc, MLUOP_LAYOUT_ARRAY, - indice_pairs_desc->dtype, 2, active_indices)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_a_desc, MLUOP_LAYOUT_ARRAY, - features_desc->dtype, 2, a_desc_dims)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_b_desc, MLUOP_LAYOUT_ARRAY, - output_grad_desc->dtype, 2, b_desc_dims)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_c_desc, MLUOP_LAYOUT_ARRAY, - filters_grad_desc->dtype, 2, c_desc_dims)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, nullptr, requested_algo_count, &heuristic_result, 
- &return_algo_count)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &temp_matmul_size)); - - if (is_get_workspace) { - matmul_ws_size = - temp_matmul_size > matmul_ws_size ? temp_matmul_size : matmul_ws_size; - } else { - void *filters_grad_buffer = (char *)filters_grad_temp + i * cico_size; - void *gather_input_indice = (char *)indice_pairs + i * 2 * pair_low_size; - void *gather_output_grad = - (char *)indice_pairs + i * 2 * pair_low_size + pair_low_size; - // gather activate input data [n, ci] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, features, - cnnl_indices_desc, gather_input_indice, - cnnl_output_desc, input_temp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // gatehr activate diffy data [n, co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, output_grad, - cnnl_indices_desc, gather_output_grad, - cnnl_output_desc, diffy_temp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // get part filters_grad [ci, co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_d_desc); - CALL_CNNL( - cnnlMatMul_v2(cnnl_handle, matmul_desc, matmul_algo, &alpha, - cnnl_a_desc, input_temp, cnnl_b_desc, diffy_temp, - &beta, cnnl_c_desc, filters_grad_buffer, matmul_ws, - temp_matmul_size, cnnl_d_desc, filters_grad_buffer)); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - } - } - - // trans temp filters_grad if needed - uint64_t trans_ws_size = 0; - if (filters_grad_need_trans) { - void *trans_ws = input_temp; // multiplexing of space - CHECK_RETURN( - api_name, - insertTranspose(api_name, handle, filters_grad_desc, filters_grad_temp, - filters_grad, trans_ws, &trans_ws_size, - is_get_workspace, kernel_volume, ci, co)); - } - - if (is_get_workspace) { - *workspace_size = filters_grad_trans_size + - std::max(trans_ws_size, max_input_size + max_diffy_size + - matmul_ws_size); - } - - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(active_indice_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_a_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_b_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescDestroy(matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(matmul_algo)); - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(heuristic_result)); - 
return MLUOP_STATUS_SUCCESS; -} - -/***************** workspace **************************/ -/*| temp filters_grad | temp features | temp output_grad| matmul_ws | */ -/*| temp filters_grad | transpose_ws | */ -/* multiplexing of space:(transpose_ws, temp_input + temp_diffy + - * matmul_ws) */ -mluOpStatus_t MLUOP_WIN_API -mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t filters_grad_desc, const int64_t indice_num[], - const int64_t inverse, const int64_t subm, size_t *size) { - const std::string api_name = - "[mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize]"; - PARAM_CHECK(api_name, size != nullptr); - auto basic_check = - baseParamCheck(api_name, handle, features_desc, output_grad_desc, - indice_pairs_desc, filters_grad_desc, indice_num, inverse); - if (MLUOP_STATUS_SUCCESS != basic_check) { - return basic_check; - } - - // zero element check - if (0 == features_desc->total_element_num || - 0 == output_grad_desc->total_element_num || - 0 == indice_pairs_desc->total_element_num || - 0 == filters_grad_desc->total_element_num) { - VLOG(5) << api_name << " Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - CHECK_RETURN(api_name, - internalIndiceConvBackwardFilter( - api_name, handle, features_desc, nullptr, output_grad_desc, - nullptr, indice_pairs_desc, nullptr, indice_num, nullptr, - size, filters_grad_desc, nullptr)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardFilter( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const void *features, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t subm, void 
*workspace, size_t workspace_size, - const mluOpTensorDescriptor_t filters_grad_desc, void *filters_grad) { - const std::string api_name = "[mluOpIndiceConvolutionBackwardFilter]"; - - auto basic_check = - baseParamCheck(api_name, handle, features_desc, output_grad_desc, - indice_pairs_desc, filters_grad_desc, indice_num, inverse); - if (MLUOP_STATUS_SUCCESS != basic_check) { - return basic_check; - } - - // zero element check - if (0 == features_desc->total_element_num || - 0 == output_grad_desc->total_element_num || - 0 == indice_pairs_desc->total_element_num || - 0 == filters_grad_desc->total_element_num) { - VLOG(5) << api_name << " Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // check data ptr - PARAM_CHECK(api_name, features != nullptr); - PARAM_CHECK(api_name, output_grad != nullptr); - PARAM_CHECK(api_name, indice_pairs != nullptr); - PARAM_CHECK(api_name, filters_grad != nullptr); - if (workspace_size > 0) { - PARAM_CHECK(api_name, workspace != nullptr); - } - - // gen_case - if (MLUOP_GEN_CASE_ON_NEW) { - indiceConvFilterGencase(handle, features_desc, features, output_grad_desc, - output_grad, indice_pairs_desc, indice_pairs, - indice_num, inverse, subm, workspace, - workspace_size, filters_grad_desc, filters_grad); - } - - CHECK_RETURN(api_name, - internalIndiceConvBackwardFilter( - api_name, handle, features_desc, features, output_grad_desc, - output_grad, indice_pairs_desc, indice_pairs, indice_num, - workspace, nullptr, filters_grad_desc, filters_grad)); - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/indice_convolution_forward/indice_convolution_forward.cpp b/kernels/indice_convolution_forward/indice_convolution_forward.cpp deleted file mode 100644 index 3c7f024f8..000000000 --- a/kernels/indice_convolution_forward/indice_convolution_forward.cpp +++ /dev/null @@ -1,636 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -static mluOpStatus_t foolProof( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, const int64_t indice_num[], - const int64_t num_act_out, const int64_t inverse, const int64_t sub_m, - const mluOpTensorDescriptor_t features_out_desc) { - // nullptr check - PARAM_CHECK(api_name, handle != nullptr); - PARAM_CHECK(api_name, features_desc != nullptr); - PARAM_CHECK(api_name, filters_desc != nullptr); - PARAM_CHECK(api_name, indice_pairs_desc != nullptr); - PARAM_CHECK(api_name, indice_num != nullptr); - PARAM_CHECK(api_name, features_out_desc != nullptr); - - // platform check - if (handle->arch < 372) { - LOG(ERROR) << api_name << "Only mlu300 and above devices are supported." 
- << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // data type check - PARAM_CHECK(api_name, features_desc->dtype == MLUOP_DTYPE_FLOAT || - features_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api_name, filters_desc->dtype == MLUOP_DTYPE_FLOAT || - filters_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api_name, indice_pairs_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(api_name, features_out_desc->dtype == MLUOP_DTYPE_FLOAT || - features_out_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api_name, features_desc->dtype == features_out_desc->dtype && - features_desc->dtype == filters_desc->dtype); - - // inverse not supported now - PARAM_CHECK(api_name, sub_m == 0 || sub_m == 1); - PARAM_CHECK(api_name, inverse == 0 || inverse == 1); - if (inverse != 0) { - LOG(ERROR) << api_name << "inverse is: " << inverse - << ", which is not supported now."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // layout check - // DHWCN layout not supported yet, use ARRAY temporarily - // PARAM_CHECK(api_name, filters_desc->layout == MLUOP_LAYOUT_DHWCN); - if (filters_desc->layout != MLUOP_LAYOUT_NDHWC && - filters_desc->layout != MLUOP_LAYOUT_NCDHW && - filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - LOG(ERROR) << api_name << "The layout of filters is: " - << mluOpGetNameOfTensorLayout(filters_desc->layout) - << ", which is not supported now."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // shape check - PARAM_CHECK(api_name, features_desc->dim == 2); - PARAM_CHECK(api_name, indice_pairs_desc->dim == 3); - PARAM_CHECK(api_name, features_out_desc->dim == 2); - if (filters_desc->dim != 5) { - LOG(ERROR) << api_name - << "The filters dimension number only support 5 currently," - << " but filters dimension number is :" << filters_desc->dim - << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // large tensor - if (mluOpGetTensorElementNum(features_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(filters_desc) >= LARGE_TENSOR_NUM || - 
mluOpGetTensorElementNum(indice_pairs_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(features_out_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << api_name << "Max tensor number overflow. Currently, " - << "MLU-OPS supports tensor elemenets number smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - auto ci = 0; - auto num_filter = 0; - auto co = 0; - if (filters_desc->layout == MLUOP_LAYOUT_ARRAY) { - ci = filters_desc->dims[3]; - num_filter = - filters_desc->dims[0] * filters_desc->dims[1] * filters_desc->dims[2]; - co = filters_desc->dims[4]; - } else { - ci = mluOpGetTensordimC(filters_desc); - num_filter = mluOpGetTensordimD(filters_desc) * - mluOpGetTensordimH(filters_desc) * - mluOpGetTensordimW(filters_desc); - co = mluOpGetTensordimN(filters_desc); - } - - // features shape check - PARAM_CHECK(api_name, features_desc->dims[0] == indice_pairs_desc->dims[2]); - PARAM_CHECK(api_name, features_desc->dims[1] == ci); - - // indice_pairs shape check - PARAM_CHECK(api_name, indice_pairs_desc->dims[0] == num_filter); - PARAM_CHECK(api_name, indice_pairs_desc->dims[1] == 2); - - // features_out shape check - PARAM_CHECK(api_name, features_out_desc->dims[0] == num_act_out); - PARAM_CHECK(api_name, features_out_desc->dims[1] == co); - - // indice_num[] check - for (int i = 0; i < num_filter; ++i) { - PARAM_CHECK(api_name, - indice_num[i] >= 0 && indice_num[i] <= features_desc->dims[0]); - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t mainIndiceConvolutionForward( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, const void *features, - const mluOpTensorDescriptor_t filters_desc, const void *filters, - const mluOpTensorDescriptor_t indice_pairs_desc, const void *indice_pairs, - const int64_t indice_num[], const int64_t num_act_out, void *workspace, - size_t *workspace_size, const mluOpTensorDescriptor_t features_out_desc, - void *features_out) { - // param init - bool 
is_workspace_compute = workspace_size != nullptr ? true : false; - bool filters_need_trans = true; - int32_t ci = 0; - int32_t co = 0; - // MLUOP_LAYOUT_DHWCN not supported yet. - if (filters_desc->layout == MLUOP_LAYOUT_ARRAY) { - filters_need_trans = false; - ci = filters_desc->dims[3]; - co = filters_desc->dims[4]; - } else { - ci = mluOpGetTensordimC(filters_desc); - co = mluOpGetTensordimN(filters_desc); - } - int32_t num_filter = indice_pairs_desc->dims[0]; - - int64_t num_act_in = indice_pairs_desc->dims[2]; - int64_t elementSize_filters = - ci * co * mluop::getSizeOfDataType(filters_desc->dtype); - int64_t elementSize_indice_pairs = - num_act_in * mluop::getSizeOfDataType(indice_pairs_desc->dtype); - - int32_t max_indice_num = 0; - for (int i = 0; i < num_filter; ++i) { - max_indice_num = - indice_num[i] > max_indice_num ? indice_num[i] : max_indice_num; - } - size_t workspaceSize_gather = - max_indice_num * ci * mluop::getSizeOfDataType(features_desc->dtype); - size_t workspaceSize_matmul = - max_indice_num * co * mluop::getSizeOfDataType(features_out_desc->dtype); - size_t workspaceSize_transpose = 0; - size_t workspaceSize_transposeExtra = 0; - if (filters_need_trans) { - workspaceSize_transpose = - num_filter * ci * co * mluop::getSizeOfDataType(filters_desc->dtype); - } - size_t workspaceSize_scatter = - num_act_out * co * mluop::getSizeOfDataType(features_out_desc->dtype); - size_t workspaceSize_matmulExtra = 0; - size_t tempSize_matmulExtra = 0; - size_t workspaceSize_addNExtra = 0; - size_t tempSize_addNExtra = 0; - size_t workspaceSize_maximum = 0; - - float matmul_alpha = 1.0; - float matmul_beta = 0.0; - int matmul_requested_algo = 1; - int matmul_recieved_algo = 0; - int matmul_is_transA = 0; - int matmul_is_transB = 0; - uint32_t matmul_allow_TF32 = 0; - uint32_t matmul_computetype = (uint32_t)filters_desc->dtype; - - // allocate workspace segment for intermediate data - void *validFilters_ptr = filters_need_trans ? 
workspace : (void *)filters; - void *transposeExtra_ptr = (char *)workspace + workspaceSize_transpose; - void *matmulResult_ptr = (char *)workspace + workspaceSize_transpose; - void *gatherResult_ptr = (char *)matmulResult_ptr + workspaceSize_matmul; - void *matmulExtra_ptr = (char *)gatherResult_ptr + workspaceSize_gather; - void *scatterResult_ptr = (char *)matmulResult_ptr + workspaceSize_matmul; - void *addNExtra_ptr = (char *)scatterResult_ptr + workspaceSize_scatter; - void *addN_ptrs[2] = {scatterResult_ptr, features_out}; - - // create intermediate tensor - mluOpTensorDescriptor_t active_indice_desc; - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - cnnlMatMulDescriptor_t matmul_desc; - // mluOpTensorDescriptor_t addN_descriptors[2] = {features_out_desc, - // features_out_desc}; - cnnlMatMulAlgo_t matmul_algo; - cnnlMatMulHeuristicResult_t heuristic_result; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&active_indice_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); - - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &matmul_is_transA, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &matmul_is_transB, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, - &matmul_computetype, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &matmul_allow_TF32, sizeof(int32_t))); - - // transpose filters to DHWNC layout - if (filters_need_trans) { - int trans_in_shape[3] = {0, 0, 0}; - int trans_out_shape[3] = {num_filter, ci, co}; - int permute[3] = {0, 0, 0}; - 
if (MLUOP_LAYOUT_NDHWC == filters_desc->layout) { - trans_in_shape[0] = co; - trans_in_shape[1] = num_filter; - trans_in_shape[2] = ci; - permute[0] = 1; - permute[1] = 2; - permute[2] = 0; - } else { - // MLUOP_LAYOUT_NCDHW == filters_desc->layout - trans_in_shape[0] = co; - trans_in_shape[1] = ci; - trans_in_shape[2] = num_filter; - permute[0] = 2; - permute[1] = 1; - permute[2] = 0; - } - mluOpTensorDescriptor_t trans_in_desc, trans_out_desc; - cnnlTransposeDescriptor_t trans_desc; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_in_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_out_desc)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_in_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 3, trans_in_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_out_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 3, trans_out_shape)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, 3, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &workspaceSize_transposeExtra)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (!is_workspace_compute) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_out_desc, cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2( - cnnl_handle, trans_desc, cnnl_x_desc, filters, cnnl_y_desc, - validFilters_ptr, transposeExtra_ptr, workspaceSize_transposeExtra)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_in_desc)); - CHECK_RETURN(api_name, 
mluOpDestroyTensorDescriptor(trans_out_desc)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - } - - // invoke gather_nd and matmul to finish indice conv - int32_t active_point_num = 0; - int32_t active_indice[2] = {0, 1}; - int32_t matmul_a_shape[2] = {0, ci}; - int32_t matmul_b_shape[2] = {ci, co}; - int32_t matmul_c_shape[2] = {0, co}; - float init_val = 0; - - if (!is_workspace_compute) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &init_val, - cnnl_output_desc, features_out)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - for (int i = 0; i < num_filter; ++i) { - active_point_num = indice_num[i]; - if (active_point_num <= 0) { - continue; - } - active_indice[0] = active_point_num; - matmul_a_shape[0] = active_point_num; - matmul_c_shape[0] = active_point_num; - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - active_indice_desc, MLUOP_LAYOUT_ARRAY, - indice_pairs_desc->dtype, 2, active_indice)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_a_desc, MLUOP_LAYOUT_ARRAY, - features_desc->dtype, 2, matmul_a_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_b_desc, MLUOP_LAYOUT_ARRAY, - features_out_desc->dtype, 2, matmul_b_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_c_desc, MLUOP_LAYOUT_ARRAY, - features_desc->dtype, 2, matmul_c_shape)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, cnnl_a_desc, 
cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, nullptr, matmul_requested_algo, &heuristic_result, - &matmul_recieved_algo)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &tempSize_matmulExtra)); - uint32_t addn_num = 2; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = - (cnnlTensorDescriptor_t *)malloc(sizeof(cnnlTensorDescriptor_t) * - addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlGetAddNWorkspaceSize(cnnl_handle, cnnl_input_descs, addn_num, - cnnl_output_desc, &tempSize_addNExtra), - CNNL_STATUS_SUCCESS, - "[cnnlAddN_v2] Internal error accured in cnnlGetAddNWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - if (is_workspace_compute) { - workspaceSize_matmulExtra = - tempSize_matmulExtra > workspaceSize_matmulExtra - ? tempSize_matmulExtra - : workspaceSize_matmulExtra; - workspaceSize_addNExtra = tempSize_addNExtra > workspaceSize_addNExtra - ? 
tempSize_addNExtra - : workspaceSize_addNExtra; - } else { - void *filters_buffer = (char *)validFilters_ptr + i * elementSize_filters; - void *gatherIndice_buffer = - (char *)indice_pairs + i * 2 * elementSize_indice_pairs; - void *scatterAddIndice_buffer = - (char *)indice_pairs + (i * 2 + 1) * elementSize_indice_pairs; - // invoke gather to get input data: - // [num_act_in, ci] -> [indice_pairs_num[i], ci] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, features, - cnnl_indices_desc, gatherIndice_buffer, - cnnl_output_desc, gatherResult_ptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // invoke matmul to get intermediate result: - // [indice_pairs_num[i], ci] * [ci, co] = [indice_pairs_num[i], co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_d_desc); - CALL_CNNL(cnnlMatMul_v2( - cnnl_handle, matmul_desc, matmul_algo, &matmul_alpha, cnnl_a_desc, - gatherResult_ptr, cnnl_b_desc, filters_buffer, &matmul_beta, - cnnl_c_desc, matmulResult_ptr, matmulExtra_ptr, - tempSize_matmulExtra, cnnl_d_desc, matmulResult_ptr)); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &init_val, - cnnl_output_desc, scatterResult_ptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // invoke scatter_add to add intermediate result to final result: - // [indice_num[i], co] -> [num_act_out, co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2(cnnl_handle, CNNL_SCATTERND_UPDATE, - cnnl_indices_desc, scatterAddIndice_buffer, - cnnl_updates_desc, matmulResult_ptr, - cnnl_input_desc, scatterResult_ptr, - cnnl_output_desc, scatterResult_ptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - int addn_num = 2; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = - (cnnlTensorDescriptor_t *)malloc(sizeof(cnnlTensorDescriptor_t) * - addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlAddN_v2(cnnl_handle, cnnl_input_descs, addN_ptrs, - addn_num, cnnl_output_desc, features_out, - addNExtra_ptr, 
tempSize_addNExtra)); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - } - } - if (is_workspace_compute) { - workspaceSize_maximum = std::max( - workspaceSize_matmul + workspaceSize_gather + workspaceSize_matmulExtra, - workspaceSize_transposeExtra); - workspaceSize_maximum = std::max( - workspaceSize_matmul + workspaceSize_scatter + workspaceSize_addNExtra, - workspaceSize_maximum); - *workspace_size = workspaceSize_transpose + workspaceSize_maximum; - } - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(active_indice_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_a_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_b_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescDestroy(matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(matmul_algo)); - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(heuristic_result)); - return MLUOP_STATUS_SUCCESS; -} - -// workspace composition: -// | transposed filters | transpose_extra | -// || -// \/ -// | transposed filters | matmul_result | gather_result | matmul_extra | -// || -// \/ -// | transposed filters | matmul_result | scatter_result | addN_extra | -mluOpStatus_t MLUOP_WIN_API mluOpGetIndiceConvolutionForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t features_out_desc, const int64_t indice_num[], - const int64_t num_act_out, const int64_t inverse, const int64_t sub_m, - size_t *size) { - const std::string api_name = - "[mluOpGetIndiceConvolutionForwardWorkspaceSize]"; - - // foolproof check - auto fool_proof = foolProof(api_name, handle, features_desc, filters_desc, - indice_pairs_desc, indice_num, 
num_act_out, - inverse, sub_m, features_out_desc); - if (fool_proof != MLUOP_STATUS_SUCCESS) { - return fool_proof; - } - - // zero element - if (mluOpGetTensorElementNum(features_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(filters_desc) == 0 || - mluOpGetTensorElementNum(features_out_desc) == 0) { - VLOG(5) << api_name << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // nullptr check - PARAM_CHECK(api_name, size != nullptr); - - // main process - CHECK_RETURN(api_name, - mainIndiceConvolutionForward( - api_name, handle, features_desc, nullptr, filters_desc, - nullptr, indice_pairs_desc, nullptr, indice_num, num_act_out, - nullptr, size, features_out_desc, nullptr)); - VLOG(5) << api_name << "workspace size: " << *size << "."; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const void *features, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], - const int64_t num_act_out, const int64_t inverse, const int64_t sub_m, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t features_out_desc, void *features_out) { - const std::string api_name = "[mluOpIndiceConvolutionForward]"; - - // foolproof check - auto fool_proof = foolProof(api_name, handle, features_desc, filters_desc, - indice_pairs_desc, indice_num, num_act_out, - inverse, sub_m, features_out_desc); - if (fool_proof != MLUOP_STATUS_SUCCESS) { - return fool_proof; - } - - // zero element - if (mluOpGetTensorElementNum(filters_desc) == 0 || - mluOpGetTensorElementNum(features_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(features_out_desc) == 0) { - VLOG(5) << api_name << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; 
- } - - // data pointer nullptr check - PARAM_CHECK(api_name, features != nullptr); - PARAM_CHECK(api_name, filters != nullptr); - PARAM_CHECK(api_name, indice_pairs != nullptr); - PARAM_CHECK(api_name, features_out != nullptr); - if (workspace_size > 0) { - PARAM_CHECK(api_name, workspace != nullptr); - } - - // gen_case - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("indice_convolution_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "features", features, features_desc); - GEN_CASE_DATA_REAL(true, "filters", filters, filters_desc); - GEN_CASE_DATA_REAL(true, "indice_pairs_desc", indice_pairs, - indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "features_out", features_out, features_out_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "indice_convolution_forward", "inverse", - inverse); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_forward", "sub_m", sub_m); - GEN_CASE_OP_PARAM_ARRAY(1, "indice_convolution_forward", "indice_num", - indice_num, indice_pairs_desc->dims[0]); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_forward", "num_active_out", - num_act_out); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - // main process - CHECK_RETURN(api_name, mainIndiceConvolutionForward( - api_name, handle, features_desc, features, - filters_desc, filters, indice_pairs_desc, - indice_pairs, indice_num, num_act_out, workspace, - nullptr, features_out_desc, features_out)); - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_END(); - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_col2im_forward/masked_col2im_forward.cpp b/kernels/masked_col2im_forward/masked_col2im_forward.cpp deleted file mode 100644 index 97d30717c..000000000 --- a/kernels/masked_col2im_forward/masked_col2im_forward.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "masked_col2im_forward.h" - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static void policyFunc(const mluOpHandle_t handle, const int mask_cnt, - cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - const size_t cluster_limit = - mluop::runtime::getClusterLimitCapability(handle); - const size_t core_limit = - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - const size_t task_dim = CEIL_ALIGN(mask_cnt, core_limit); - k_dim->x = core_limit; - k_dim->y = (task_dim / core_limit) > cluster_limit ? 
cluster_limit - : (task_dim / core_limit); - k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; -} - -static mluOpStatus_t maskedCol2imForwardPreCheck( - const mluOpTensorDescriptor_t col_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, - const mluOpTensorDescriptor_t im_desc) { - PARAM_CHECK("[mluOpMaskedCol2imForward]", col_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_w_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", col_desc->dim == 2); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im_desc->dim == 4); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_w_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - im_desc->layout == MLUOP_LAYOUT_NCHW); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - col_desc->dtype == MLUOP_DTYPE_FLOAT || - col_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK("[mluOpMaskedCol2imForward]", col_desc->dtype == im_desc->dtype); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - mask_h_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - mask_w_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im_desc->dims[0] == 1); - - PARAM_CHECK("[mluOpMaskedCol2imForward]", - mask_h_idx_desc->dims[0] == mask_w_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - col_desc->dims[1] == mask_h_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - col_desc->dims[0] == im_desc->dims[1]); - - const size_t col_element_num = mluOpGetTensorElementNum(col_desc); - const size_t mask_h_idx_element_num = - mluOpGetTensorElementNum(mask_h_idx_desc); - const size_t im_element_num = mluOpGetTensorElementNum(im_desc); - TENSOR_NUM_CHECK("[mluOpMaskedCol2imForward]", col_element_num, - 
LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedCol2imForward]", im_element_num, - LARGE_TENSOR_NUM, ""); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetMaskedCol2imForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t col_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, - const mluOpTensorDescriptor_t im_desc, size_t *workspace_size) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - PARAM_CHECK("[mluOpMaskedCol2imForward]", handle != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", workspace_size != NULL); - status = maskedCol2imForwardPreCheck(col_desc, mask_h_idx_desc, - mask_w_idx_desc, im_desc); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - if (mluOpGetTensorElementNum(im_desc) == 0 || col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedCol2imForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedCol2imForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - *workspace_size = col_desc->total_tensor_size; - *workspace_size += im_desc->total_tensor_size; - - cnnlTransposeDescriptor_t trans_desc; - size_t col_transpose_workspace_size = 0; - int col_dim = col_desc->dim; - int col_permute[2] = {1, 0}; - int col_MC_dims[2] = {0, 0}; - col_MC_dims[0] = col_desc->dims[1]; - col_MC_dims[1] = col_desc->dims[0]; - mluOpTensorDescriptor_t col_MC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&col_MC_desc_tmp)); - - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(col_MC_desc_tmp, MLUOP_LAYOUT_ARRAY, - col_desc->dtype, col_dim, col_MC_dims)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, col_dim, col_permute)); 
- { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(col_MC_desc_tmp, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &col_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - int im_dim = im_desc->dim; - int im_permute[4] = {0, 3, 1, 2}; - int NCHW2NHWC_permute[4] = {0, 2, 3, 1}; - int im_NHWC_dims[4] = {0, 0, 0, 0}; - for (int i = 0; i < im_dim; ++i) { - im_NHWC_dims[i] = im_desc->dims[NCHW2NHWC_permute[i]]; - } - size_t im_transpose_workspace_size = 0; - mluOpTensorDescriptor_t im_NHWC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&im_NHWC_desc_tmp)); - - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(im_NHWC_desc_tmp, MLUOP_LAYOUT_ARRAY, - im_desc->dtype, im_dim, im_NHWC_dims)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, im_dim, im_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(im_NHWC_desc_tmp, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &im_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - *workspace_size += im_transpose_workspace_size > col_transpose_workspace_size - ? 
im_transpose_workspace_size - : col_transpose_workspace_size; - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(im_NHWC_desc_tmp)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(col_MC_desc_tmp)); - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t transposeTensor( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - const void *input, const int *permute, - const mluOpTensorDescriptor_t workspace_dst_desc, void *workspace_dst, - void *transpose_workspace) { - const int input_dim = input_desc->dim; - cnnlTransposeDescriptor_t trans_desc; - size_t transpose_workspace_size = 0; - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, input_dim, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(workspace_dst_desc, - cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2(cnnl_handle, trans_desc, cnnl_x_desc, input, - cnnl_y_desc, workspace_dst, transpose_workspace, - transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMaskedCol2imForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t col_desc, - const void *col, const mluOpTensorDescriptor_t 
mask_h_idx_desc, - const void *mask_h_idx, const mluOpTensorDescriptor_t mask_w_idx_desc, - const void *mask_w_idx, const size_t workspace_size, void *workspace, - const mluOpTensorDescriptor_t im_desc, void *im) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - PARAM_CHECK("[mluOpMaskedCol2imForward]", handle != NULL); - status = maskedCol2imForwardPreCheck(col_desc, mask_h_idx_desc, - mask_w_idx_desc, im_desc); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - if (mluOpGetTensorElementNum(im_desc) == 0 || col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedCol2imForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedCol2imForward] Skip zero element tensor."; - uint64_t fill_value = 0x0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(im_desc, cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, im)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; - } - if (workspace_size > 0) { - PARAM_CHECK("[mluOpMaskedCol2imForward]", workspace != NULL); - } - PARAM_CHECK("[mluOpMaskedCol2imForward]", col != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_w_idx != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im != NULL); - - // generate mluOpMaskedCol2imForward prototxt start! 
- if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("masked_col2im_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "col", col, col_desc, -10, 10); - GEN_CASE_DATA_REAL(true, "mask_h_idx", mask_h_idx, mask_h_idx_desc); - GEN_CASE_DATA_REAL(true, "mask_w_idx", mask_w_idx, mask_w_idx_desc); - GEN_CASE_DATA(false, "im", im, im_desc, 0, 0); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0, 0, 0); - } - // generate mluOpMaskedCol2imForward prototxt end! - mluOpDataType_t input_dtype = col_desc->dtype; - void *col_workspace = workspace; - void *im_workspace = (char *)workspace + col_desc->total_tensor_size; - void *transpose_workspace = (char *)im_workspace + im_desc->total_tensor_size; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - const int mask_cnt = mask_h_idx_desc->dims[0]; - policyFunc(handle, mask_cnt, &k_dim, &k_type); - - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlFill_v3 start."; - const int im_dim = im_desc->dim; - int NCHW2NHWC_permute[4] = {0, 2, 3, 1}; - int im_NHWC_dims[4] = {0, 0, 0, 0}; - for (int i = 0; i < im_dim; ++i) { - im_NHWC_dims[i] = im_desc->dims[NCHW2NHWC_permute[i]]; - } - mluOpTensorDescriptor_t im_NHWC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&im_NHWC_desc_tmp)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(im_NHWC_desc_tmp, MLUOP_LAYOUT_ARRAY, - im_desc->dtype, im_dim, im_NHWC_dims)); - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(im_NHWC_desc_tmp, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, im_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlFill_v3 end."; - - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 col start."; - - int col_dim = col_desc->dim; - int col_permute[2] = {1, 0}; - int 
col_MC_dims[2] = {0, 0}; - col_MC_dims[0] = col_desc->dims[1]; - col_MC_dims[1] = col_desc->dims[0]; - mluOpTensorDescriptor_t col_MC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&col_MC_desc_tmp)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(col_MC_desc_tmp, MLUOP_LAYOUT_ARRAY, - col_desc->dtype, col_dim, col_MC_dims)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - transposeTensor(handle, col_desc, col, col_permute, col_MC_desc_tmp, - col_workspace, transpose_workspace)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(col_MC_desc_tmp)); - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 col end."; - - const int channels = im_desc->dims[1]; - const int height = im_desc->dims[2]; - const int width = im_desc->dims[3]; - VLOG(5) << "Launch kernel MLUUnion1MaskedCol2imForward<<<" << k_dim.x << ", " - << k_dim.y << ", " << k_dim.z << ">>>."; - CHECK_RETURN("[mluOpMaskedCol2imForward]", - KernelMaskedCol2imForward(k_dim, k_type, handle->queue, - input_dtype, col_workspace, height, - width, channels, mask_h_idx, - mask_w_idx, mask_cnt, im_workspace)); - VLOG(5) << "Finish launch MLUUnion1MaskedCol2imForward."; - - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 im start."; - int im_permute[4] = {0, 3, 1, 2}; - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == transposeTensor(handle, im_NHWC_desc_tmp, - im_workspace, im_permute, im_desc, - im, transpose_workspace)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(im_NHWC_desc_tmp)); - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 im end."; - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_col2im_forward/masked_col2im_forward.h b/kernels/masked_col2im_forward/masked_col2im_forward.h deleted file mode 100644 index e6a7645f7..000000000 --- 
a/kernels/masked_col2im_forward/masked_col2im_forward.h +++ /dev/null @@ -1,35 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MASKED_COL2IM_FORWARD_MASKED_COL2IM_FORWARD_H_ -#define KERNELS_MASKED_COL2IM_FORWARD_MASKED_COL2IM_FORWARD_H_ - -#include "mlu_op.h" - -// decare func -mluOpStatus_t MLUOP_WIN_API KernelMaskedCol2imForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *col, const int height, - const int width, const int channels, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *im); - -#endif // KERNELS_MASKED_COL2IM_FORWARD_MASKED_COL2IM_FORWARD_H_ diff --git a/kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu b/kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu deleted file mode 100644 index cf0ce7b07..000000000 --- a/kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu +++ /dev/null @@ -1,121 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "masked_col2im_forward.h" - -#include - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char data_nram[MAX_NRAM_SIZE]; - -template -__mlu_func__ void MLUMultiKernelMaskedCol2imForward( - const T *col, const int height, const int width, const int channels, - const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, - T *im) { - const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); - if (channels <= channels_max_num_nram) { - const int deal_num = channels_max_num_nram / channels; - int mask_per_core = mask_cnt / taskDim; - const int mask_remain = mask_cnt % taskDim; - mask_per_core += taskId < mask_remain ? 1 : 0; - int index_start = taskId < mask_remain - ? 
taskId * mask_per_core - : taskId * mask_per_core + mask_remain; - int loop = mask_per_core / deal_num; - int remain_num = mask_per_core % deal_num; - T *nram_col = (T *)data_nram; - for (int index = 0; index < loop; ++index) { - int cur_index = index_start + index * deal_num; - __memcpy(nram_col, col + cur_index * channels, - deal_num * channels * sizeof(T), GDRAM2NRAM); - for (int i = 0; i < deal_num; ++i) { - int mask_index = cur_index + i; - const int h_im = mask_h_idx[mask_index]; - const int w_im = mask_w_idx[mask_index]; - __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, - channels * sizeof(T), NRAM2GDRAM); - } - } - if (remain_num > 0) { - int cur_index = index_start + loop * deal_num; - __memcpy(nram_col, col + cur_index * channels, - remain_num * channels * sizeof(T), GDRAM2NRAM); - for (int i = 0; i < remain_num; ++i) { - int mask_index = cur_index + i; - const int h_im = mask_h_idx[mask_index]; - const int w_im = mask_w_idx[mask_index]; - __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, - channels * sizeof(T), NRAM2GDRAM); - } - } - } else { - for (int index = taskId; index < mask_cnt; index += taskDim) { - const int m_index = index % mask_cnt; - const int h_im = mask_h_idx[m_index]; - const int w_im = mask_w_idx[m_index]; - __memcpy(im + (h_im * width + w_im) * channels, col + index * channels, - channels * sizeof(T), GDRAM2GDRAM); - } - } -} - -template -__mlu_entry__ void MLUUnion1MaskedCol2imForward( - const void *col, const int height, const int width, const int channels, - const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, - void *im) { - if (__is_mpu()) { - return; - } - MLUMultiKernelMaskedCol2imForward((T *)col, height, width, channels, - (int32_t *)mask_h_idx, - (int32_t *)mask_w_idx, mask_cnt, (T *)im); -} - -mluOpStatus_t MLUOP_WIN_API KernelMaskedCol2imForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void 
*col, const int height, - const int width, const int channels, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *im) { - switch (data_dtype) { - /* Only float and half data types are supported - in host-side CPP file fool-proof processing. */ - case MLUOP_DTYPE_FLOAT: { - KERNEL_CHECK( - MLUUnion1MaskedCol2imForward - <<>>(col, height, width, channels, mask_h_idx, - mask_w_idx, mask_cnt, im)); - }; break; - case MLUOP_DTYPE_HALF: { - KERNEL_CHECK(MLUUnion1MaskedCol2imForward<<>>( - col, height, width, channels, mask_h_idx, mask_w_idx, mask_cnt, im)); - }; break; - default: - break; - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_im2col_forward/masked_im2col_forward.cpp b/kernels/masked_im2col_forward/masked_im2col_forward.cpp deleted file mode 100644 index 04373a125..000000000 --- a/kernels/masked_im2col_forward/masked_im2col_forward.cpp +++ /dev/null @@ -1,373 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/masked_im2col_forward/masked_im2col_forward.h" - -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -// policy function -static void policyFunc(const mluOpHandle_t handle, const int mask_cnt, - cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - const size_t cluster_limit = - mluop::runtime::getClusterLimitCapability(handle); - const size_t core_limit = - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - const size_t task_dim = CEIL_ALIGN(mask_cnt, core_limit); - k_dim->x = core_limit; - k_dim->y = (task_dim / core_limit) > cluster_limit ? 
cluster_limit - : (task_dim / core_limit); - k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; -} - -static mluOpStatus_t maskedIm2colForwardPreCheck( - const mluOpHandle_t handle, const mluOpTensorDescriptor_t feature_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, - const mluOpTensorDescriptor_t data_col_desc, const int kernel_h, - const int kernel_w) { - PARAM_CHECK("[mluOpMaskedIm2colForward]", handle != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature_desc != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_w_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", data_col_desc != NULL); - - PARAM_CHECK("[mluOpMaskedIm2colForward]", - feature_desc->layout == MLUOP_LAYOUT_NCHW); - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature_desc->dim == 4); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - feature_desc->dtype == MLUOP_DTYPE_FLOAT || - feature_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - feature_desc->dtype == data_col_desc->dtype); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - mask_h_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - mask_w_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature_desc->dims[0] == 1); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_w_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - mask_h_idx_desc->dims[0] == mask_w_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedIm2colForward]", data_col_desc->dim == 2); - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - data_col_desc->dims[0] == feature_desc->dims[1] * kernel_h * kernel_w); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - data_col_desc->dims[1] == mask_h_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedIm2colForward]", kernel_h > 0); - 
PARAM_CHECK("[mluOpMaskedIm2colForward]", kernel_w > 0); - - const uint64_t feature_element_num = mluOpGetTensorElementNum(feature_desc); - const uint64_t mask_h_idx_element_num = - mluOpGetTensorElementNum(mask_h_idx_desc); - const uint64_t data_col_element_num = mluOpGetTensorElementNum(data_col_desc); - TENSOR_NUM_CHECK("[mluOpMaskedIm2colForward]", feature_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedIm2colForward]", data_col_element_num, - LARGE_TENSOR_NUM, ""); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetMaskedIm2colForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t feature_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, const int kernel_h, - const int kernel_w, const mluOpTensorDescriptor_t data_col_desc, - size_t *workspace_size) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - PARAM_CHECK("[mluOpMaskedIm2colForward]", workspace_size != NULL); - status = maskedIm2colForwardPreCheck(handle, feature_desc, mask_h_idx_desc, - mask_w_idx_desc, data_col_desc, kernel_h, - kernel_w); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - if (mluOpGetTensorElementNum(feature_desc) == 0 || - data_col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedIm2colForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedIm2colForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - *workspace_size = feature_desc->total_tensor_size; - *workspace_size += data_col_desc->total_tensor_size; - - cnnlTransposeDescriptor_t trans_desc; - size_t feature_transpose_workspace_size = 0; - int feature_dim = feature_desc->dim; - int feature_permute[4] = {0, 3, 1, 2}; - - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - 
CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, feature_dim, feature_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(feature_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_x_desc, - trans_desc, - &feature_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (mluOpGetTensorElementNum(feature_desc) == 0 || - data_col_desc->dims[0] == 0) { - VLOG(5) << "[mluOpMaskedIm2colForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - int data_col_dim = 3; - int data_col_permute[3] = {2, 1, 0}; - int data_col_HWC_dims[3] = {0, 0, 0}; - int data_col_CHW_dims[3] = {0, 0, 0}; - data_col_HWC_dims[0] = mask_h_idx_desc->dims[0]; - data_col_HWC_dims[1] = kernel_h * kernel_w; - data_col_HWC_dims[2] = feature_desc->dims[1]; - for (int i = 0; i < data_col_dim; ++i) { - data_col_CHW_dims[i] = data_col_HWC_dims[data_col_permute[i]]; - } - size_t data_col_transpose_workspace_size = 0; - mluOpTensorDescriptor_t data_col_HWC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&data_col_HWC_desc_tmp)); - - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - data_col_HWC_desc_tmp, MLUOP_LAYOUT_ARRAY, - feature_desc->dtype, data_col_dim, data_col_HWC_dims)); - CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, data_col_dim, data_col_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(data_col_HWC_desc_tmp, - cnnl_x_desc); - CALL_CNNL( - cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_x_desc, trans_desc, - &data_col_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - *workspace_size += - data_col_transpose_workspace_size > feature_transpose_workspace_size - ? 
data_col_transpose_workspace_size - : feature_transpose_workspace_size; - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(data_col_HWC_desc_tmp)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t transposeTensor( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - const void *input, const int *permute, - const mluOpTensorDescriptor_t workspace_dst_desc, void *workspace_dst, - void *transpose_workspace) { - int input_dim = input_desc->dim; - cnnlTransposeDescriptor_t trans_desc; - size_t transpose_workspace_size = 0; - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, input_dim, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(workspace_dst_desc, - cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2(cnnl_handle, trans_desc, cnnl_x_desc, input, - cnnl_y_desc, workspace_dst, transpose_workspace, - transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMaskedIm2colForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t feature_desc, - const void *feature, const mluOpTensorDescriptor_t mask_h_idx_desc, - const void *mask_h_idx, const mluOpTensorDescriptor_t mask_w_idx_desc, - const void 
*mask_w_idx, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t data_col_desc, - void *data_col) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - status = maskedIm2colForwardPreCheck(handle, feature_desc, mask_h_idx_desc, - mask_w_idx_desc, data_col_desc, kernel_h, - kernel_w); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - - if (mluOpGetTensorElementNum(feature_desc) == 0 || - data_col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedIm2colForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedIm2colForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - if (workspace_size > 0) { - PARAM_CHECK("[mluOpMaskedIm2colForward]", workspace != NULL); - } - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_w_idx != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", data_col != NULL); - - // generate mluOpMaskedIm2colForward prototxt start! 
- if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("masked_im2col_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "feature", feature, feature_desc, -10, 10); - GEN_CASE_DATA_REAL(true, "mask_h_idx", mask_h_idx, mask_h_idx_desc); - GEN_CASE_DATA_REAL(true, "mask_w_idx", mask_w_idx, mask_w_idx_desc); - GEN_CASE_DATA(false, "data_col", data_col, data_col_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "masked_im2col_forward", "kernel_h", kernel_h); - GEN_CASE_OP_PARAM_SINGLE(1, "masked_im2col_forward", "kernel_w", kernel_w); - GEN_CASE_OP_PARAM_SINGLE(1, "masked_im2col_forward", "pad_h", pad_h); - GEN_CASE_OP_PARAM_SINGLE(2, "masked_im2col_forward", "pad_w", pad_w); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0, 0, 0); - } - // generate mluOpMaskedIm2colForward prototxt end! - mluOpDataType_t input_dtype = feature_desc->dtype; - void *feature_workspace = workspace; - void *data_col_workspace = - (char *)workspace + feature_desc->total_tensor_size; - void *transpose_workspace = - (char *)data_col_workspace + data_col_desc->total_tensor_size; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - const int mask_cnt = mask_h_idx_desc->dims[0]; - policyFunc(handle, mask_cnt, &k_dim, &k_type); - - VLOG(5) << "[mluOpMaskedIm2colForward] cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(data_col_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, data_col_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - VLOG(5) << "[mluOpMaskedIm2colForward] cnnlTranspose_v2 feature start."; - - int feature_dim = feature_desc->dim; - int feature_permute[4] = {0, 2, 3, 1}; - int feature_tmp_dims[4] = {0, 0, 0, 0}; - - for (int i = 0; i < feature_dim; ++i) { - feature_tmp_dims[i] = feature_desc->dims[feature_permute[i]]; - } - - mluOpTensorDescriptor_t 
feature_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&feature_desc_tmp)); - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(feature_desc_tmp, MLUOP_LAYOUT_ARRAY, - input_dtype, feature_dim, feature_tmp_dims)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - transposeTensor(handle, feature_desc, feature, - feature_permute, feature_desc_tmp, - feature_workspace, transpose_workspace)); - - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(feature_desc_tmp)); - - const int channels = feature_desc->dims[1]; - const int height = feature_desc->dims[2]; - const int width = feature_desc->dims[3]; - VLOG(5) << "Launch kernel MLUUnion1MaskedIm2colForward<<<" << k_dim.x << ", " - << k_dim.y << ", " << k_dim.z << ">>>."; - CHECK_RETURN("[mluOpMaskedIm2colForward]", - KernelMaskedIm2colForward( - k_dim, k_type, handle->queue, input_dtype, feature_workspace, - height, width, channels, kernel_h, kernel_w, pad_h, pad_w, - mask_h_idx, mask_w_idx, mask_cnt, data_col_workspace)); - - VLOG(5) << "[mluOpMaskedIm2colForward] cnnlTranspose_v2 data_col start."; - const int data_col_dim = 3; - int data_col_permute[3] = {2, 1, 0}; - int data_col_HWC_dims[3] = {0, 0, 0}; - int data_col_CHW_dims[3] = {0, 0, 0}; - data_col_HWC_dims[0] = mask_cnt; - data_col_HWC_dims[1] = kernel_h * kernel_w; - data_col_HWC_dims[2] = channels; - for (int i = 0; i < data_col_dim; ++i) { - data_col_CHW_dims[i] = data_col_HWC_dims[data_col_permute[i]]; - } - - mluOpTensorDescriptor_t data_col_HWC_desc_tmp; - mluOpTensorDescriptor_t data_col_CHW_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&data_col_HWC_desc_tmp)); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&data_col_CHW_desc_tmp)); - - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(data_col_HWC_desc_tmp, - MLUOP_LAYOUT_ARRAY, input_dtype, - data_col_dim, data_col_HWC_dims)); - 
PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(data_col_CHW_desc_tmp, - MLUOP_LAYOUT_ARRAY, input_dtype, - data_col_dim, data_col_CHW_dims)); - - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - transposeTensor(handle, data_col_HWC_desc_tmp, data_col_workspace, - data_col_permute, data_col_CHW_desc_tmp, data_col, - transpose_workspace)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(data_col_HWC_desc_tmp)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(data_col_CHW_desc_tmp)); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_im2col_forward/masked_im2col_forward.h b/kernels/masked_im2col_forward/masked_im2col_forward.h deleted file mode 100644 index 30440106a..000000000 --- a/kernels/masked_im2col_forward/masked_im2col_forward.h +++ /dev/null @@ -1,36 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MASKED_IM2COL_FORWARD_MASKED_IM2COL_FORWARD_H_ -#define KERNELS_MASKED_IM2COL_FORWARD_MASKED_IM2COL_FORWARD_H_ - -#include "mlu_op.h" - -// decare func -mluOpStatus_t MLUOP_WIN_API KernelMaskedIm2colForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *feature, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *data_col); - -#endif // KERNELS_MASKED_IM2COL_FORWARD_MASKED_IM2COL_FORWARD_H_ diff --git a/kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu b/kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu deleted file mode 100644 index 3d4203227..000000000 --- a/kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu +++ /dev/null @@ -1,100 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/masked_im2col_forward/masked_im2col_forward.h" - -#include "core/logging.h" -#include "kernels/utils/common.h" - -template -__mlu_func__ void MLUMultiKernelMaskedIm2colForward( - const T *feature, const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, - const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, - T *data_col) { - for (int index = taskId; index < mask_cnt; index += taskDim) { - const int h_col = mask_h_idx[index]; - const int w_col = mask_w_idx[index]; - const int h_offset = h_col - pad_h; - const int w_offset = w_col - pad_w; - int h_start = h_offset; - int h_end = h_offset + kernel_h - 1; - int w_start = w_offset; - int w_end = w_start + kernel_w - 1; - if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) { - continue; - } else { - int h_start_valid = __mluop_max(0, h_start); - int h_end_valid = __mluop_min(height - 1, h_end); - int w_start_valid = __mluop_max(0, w_start); - int w_end_valid = __mluop_min(width - 1, w_end); - __memcpy( - data_col + index * kernel_h * kernel_w * channels + - ((h_start_valid - h_start) * kernel_w + - (w_start_valid - w_start)) * - channels, - feature + h_start_valid * width * channels + w_start_valid * channels, - (w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM, - kernel_w * channels * sizeof(T), width * channels * sizeof(T), - h_end_valid - h_start_valid); - } - } -} - -__mlu_entry__ void MLUUnion1MaskedIm2colForward( - const mluOpDataType_t data_dtype, const void *feature, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *data_col) { - if (__is_mpu()) { - return; - } - - switch (data_dtype) { - case MLUOP_DTYPE_HALF: { - 
MLUMultiKernelMaskedIm2colForward( - (half *)feature, height, width, channels, kernel_h, kernel_w, pad_h, - pad_w, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, - (half *)data_col); - }; break; - case MLUOP_DTYPE_FLOAT: { - MLUMultiKernelMaskedIm2colForward( - (float *)feature, height, width, channels, kernel_h, kernel_w, pad_h, - pad_w, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, - (float *)data_col); - }; break; - default: - break; - } -} - -mluOpStatus_t MLUOP_WIN_API KernelMaskedIm2colForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *feature, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *data_col) { - KERNEL_CHECK(MLUUnion1MaskedIm2colForward<<>>( - data_dtype, feature, height, width, channels, kernel_h, kernel_w, pad_h, - pad_w, mask_h_idx, mask_w_idx, mask_cnt, data_col)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp b/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp deleted file mode 100644 index b270359aa..000000000 --- a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "moe_dispatch_backward_data.h" - -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/utils/cnnl_helper.h" - -// policy function -static void PolicyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - // union1 policy func - *k_type = CNRT_FUNC_TYPE_UNION1; - // dimx equals to num of MLU Cores in each cluster - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - // dimy equals to num of current available clusters - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMoeDispatchBackwardData( - mluOpHandle_t handle, const mluOpTensorDescriptor_t gates_desc, - const void *gates, const mluOpTensorDescriptor_t indices_desc, - const void *indices, const mluOpTensorDescriptor_t locations_desc, - const void *locations, const mluOpTensorDescriptor_t dispatch_desc, - const void *dispatch, const int samples, const int capacity, - const int hidden, const int num_experts, - const mluOpTensorDescriptor_t grad_input_desc, void *grad_input) { - // gates: (samples) - // indices: (samples) - // locations: (samples) - // dispatch: (num_experts * capacity, hidden) - // grad_input: (samples, hidden) - - const std::string API = "[mluOpMoeDispatchBackwardData]"; - // check desc - PARAM_CHECK(API, handle != NULL); - // check arch - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << API - << "The operator does not match the current architecture."; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK(API, gates_desc != NULL); - PARAM_CHECK(API, indices_desc != NULL); - PARAM_CHECK(API, locations_desc != NULL); - PARAM_CHECK(API, dispatch_desc != NULL); - PARAM_CHECK(API, grad_input_desc != NULL); - - // check dim - PARAM_CHECK_EQ(API, 
gates_desc->dim, 1); - PARAM_CHECK_EQ(API, indices_desc->dim, 1); - PARAM_CHECK_EQ(API, locations_desc->dim, 1); - PARAM_CHECK_EQ(API, dispatch_desc->dim, 2); - PARAM_CHECK_EQ(API, grad_input_desc->dim, 2); - - // check shape - PARAM_CHECK_EQ(API, gates_desc->dims[0], samples); - PARAM_CHECK_EQ(API, indices_desc->dims[0], samples); - PARAM_CHECK_EQ(API, locations_desc->dims[0], samples); - PARAM_CHECK_EQ(API, dispatch_desc->dims[0], (num_experts * capacity)); - PARAM_CHECK_EQ(API, dispatch_desc->dims[1], hidden); - PARAM_CHECK_EQ(API, grad_input_desc->dims[0], samples); - PARAM_CHECK_EQ(API, grad_input_desc->dims[1], hidden); - - // check dtype - PARAM_CHECK_V2(API, (gates_desc->dtype == MLUOP_DTYPE_FLOAT), - "Only float are supported in input tensor, but the " - "data type of tensor is " - << mluOpGetNameOfDataType(gates_desc->dtype) << "."); - PARAM_CHECK_V2(API, (indices_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in indices tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(indices_desc->dtype) << "."); - PARAM_CHECK_V2(API, (locations_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in locations tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(locations_desc->dtype) << "."); - PARAM_CHECK(API, dispatch_desc->dtype == gates_desc->dtype); - PARAM_CHECK(API, grad_input_desc->dtype == gates_desc->dtype); - - // check tensor dim - PARAM_CHECK(API, samples >= 0); - PARAM_CHECK(API, capacity >= 0); - PARAM_CHECK(API, hidden >= 0); - PARAM_CHECK(API, num_experts >= 0); - - const uint64_t gates_element_num = mluOpGetTensorElementNum(gates_desc); - const uint64_t indices_element_num = mluOpGetTensorElementNum(indices_desc); - const uint64_t locations_element_num = - mluOpGetTensorElementNum(locations_desc); - const uint64_t dispatch_element_num = mluOpGetTensorElementNum(dispatch_desc); - const uint64_t grad_input_element_num = - mluOpGetTensorElementNum(grad_input_desc); - - // check large 
tensor - TENSOR_NUM_CHECK(API, gates_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, indices_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, locations_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, dispatch_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, grad_input_element_num, LARGE_TENSOR_NUM, ""); - - // check zero element - if (samples == 0 || hidden == 0) { - VLOG(5) << API << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } else { - // Initialize output space - PARAM_CHECK(API, grad_input != NULL); - const size_t grad_input_initial_value = 0x00; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, - &grad_input_initial_value, cnnl_output_desc, - grad_input)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - VLOG(5) << API << "Initialize output tensor done."; - } - - // check zero element - if (capacity == 0 || num_experts == 0) { - VLOG(5) << API << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // check ptr - PARAM_CHECK(API, gates != NULL); - PARAM_CHECK(API, indices != NULL); - PARAM_CHECK(API, locations != NULL); - PARAM_CHECK(API, dispatch != NULL); - PARAM_CHECK(API, grad_input != NULL); - - VLOG(5) << API << "input data shape: " - << "samples = " << samples << ", " - << "capacity = " << capacity << ", " - << "hidden = " << hidden << ", " - << "num_experts = " << num_experts; - - // generate prototxt start! 
- if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("moe_dispatch_backward_data"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "gates", gates, gates_desc, 100, -100); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(true, "locations", locations, locations_desc); - GEN_CASE_DATA(true, "dispatch", dispatch, dispatch_desc, 100, -100); - GEN_CASE_DATA(false, "grad_input", grad_input, grad_input_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "moe_dispatch_backward_data", "samples", - samples); - GEN_CASE_OP_PARAM_SINGLE(1, "moe_dispatch_backward_data", "capacity", - capacity); - GEN_CASE_OP_PARAM_SINGLE(2, "moe_dispatch_backward_data", "hidden", hidden); - GEN_CASE_OP_PARAM_SINGLE(3, "moe_dispatch_backward_data", "num_experts", - num_experts); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0, 0, 0.0); - } - // generate prototxt end! - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - PolicyFunc(handle, &k_dim, &k_type); - - int core_num_per_cluster = - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - VLOG(5) << API << "Launch Kernel <<>>" - << "core num per cluster: " << core_num_per_cluster; - - mluOpDataType_t data_type = grad_input_desc->dtype; - uint32_t taskNum = k_dim.x * k_dim.y * k_dim.z; - - if (samples <= taskNum) { - VLOG(5) << API << "Launch Kernel KernelMoeDispatchBwdData1()."; - CHECK_RETURN( - "[mluOpMoeDispatchBackwardData1]", - KernelMoeDispatchBwdData1(k_dim, k_type, handle->queue, data_type, - gates, indices, locations, dispatch, samples, - capacity, hidden, num_experts, grad_input)); - VLOG(5) << API << "Finish Kernel KernelMoeDispatchBwdData1."; - } else { - VLOG(5) << API << "Launch Kernel KernelMoeDispatchBwdData2()."; - CHECK_RETURN( - "[mluOpMoeDispatchBackwardData2]", - KernelMoeDispatchBwdData2(k_dim, k_type, handle->queue, data_type, - gates, indices, locations, dispatch, samples, - capacity, hidden, num_experts, grad_input)); - VLOG(5) << API << "Finish Kernel KernelMoeDispatchBwdData2."; - } - - 
GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h b/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h deleted file mode 100644 index b392cc217..000000000 --- a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h +++ /dev/null @@ -1,42 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MOE_DISPATCH_BACKWARD_DATA_MOE_DISPATCH_BACKWARD_DATA_H -#define KERNELS_MOE_DISPATCH_BACKWARD_DATA_MOE_DISPATCH_BACKWARD_DATA_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_input); - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_input); - -#endif // KERNELS_MOE_DISPATCH_BACKWARD_DATA_MOE_DISPATCH_BACKWARD_DATA_H diff --git a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu b/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu deleted file mode 100644 index 4d2e9d08f..000000000 --- a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu +++ /dev/null @@ -1,339 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "moe_dispatch_backward_data.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#if __BANG_ARCH__ >= 372 -template -static __mlu_func__ void load(T *dispatch_addr, T *nram_dispatch, - const int deal_num, const int pingpong_num, - const int pi) { - int offset = (pi % 2) * pingpong_num; - T *nram_dispatch_p = nram_dispatch + offset; - __memcpy_async(nram_dispatch_p, dispatch_addr, deal_num * sizeof(T), - GDRAM2NRAM); -} - -template -static __mlu_func__ void compute(T *nram_gard_input, T *nram_dispatch, - const T gates_value, const int deal_num, - const int pingpong_num, const int pi) { - int offset = (pi % 2) * pingpong_num; - T *nram_gard_input_p = nram_gard_input + offset; - T *nram_dispatch_p = nram_dispatch + offset; - __bang_mul_scalar(nram_gard_input_p, nram_dispatch_p, gates_value, deal_num); -} - -template -static __mlu_func__ void store(T *grad_input_addr, T *nram_grad_input, - const int deal_num, const int pingpong_num, - const int pi) { - int offset = (pi % 2) * pingpong_num; - T *nram_grad_input_p = nram_grad_input + offset; - __memcpy_async(grad_input_addr, nram_grad_input_p, deal_num * sizeof(T), - NRAM2GDRAM); -} - -template -static __mlu_func__ void lcs(T *base_gard_input_addr, T *base_dispatch_addr, - T *nram_gard_input, T *nram_dispatch, - const T gates_value, const int repeat_num, - const int rem_num, const int deal_num, - const int pingpong_num) { - if (repeat_num > 0) { - // L[0] - T *dispatch_addr = base_dispatch_addr; - load(dispatch_addr, nram_dispatch, deal_num, pingpong_num, 0); - __sync(); - } - if (repeat_num > 1) { - // L[1] - T *dispatch_addr = base_dispatch_addr + deal_num; - load(dispatch_addr, nram_dispatch, deal_num, pingpong_num, 1); - // C[0] - compute(nram_gard_input, nram_dispatch, gates_value, deal_num, pingpong_num, - 0); - 
__sync(); - } - for (int n_iter = 0; n_iter < repeat_num - 2; ++n_iter) { - // S[n_iter] - T *gard_input_addr = base_gard_input_addr + n_iter * deal_num; - store(gard_input_addr, nram_gard_input, deal_num, pingpong_num, n_iter); - // L[n_iter + 2] - T *dispatch_addr = base_dispatch_addr + (n_iter + 2) * deal_num; - load(dispatch_addr, nram_dispatch, deal_num, pingpong_num, n_iter + 2); - // C[n_iter + 1] - compute(nram_gard_input, nram_dispatch, gates_value, deal_num, pingpong_num, - n_iter + 1); - __sync(); - } - if (repeat_num >= 2) { - // S[repeat_num - 2] - T *gard_input_addr = base_gard_input_addr + (repeat_num - 2) * deal_num; - store(gard_input_addr, nram_gard_input, deal_num, pingpong_num, - repeat_num - 2); - } - if (rem_num > 0) { - // L[repeat_num] - T *dispatch_addr = base_dispatch_addr + repeat_num * deal_num; - load(dispatch_addr, nram_dispatch, rem_num, pingpong_num, repeat_num); - } - if (repeat_num > 0) { - // C[repeat_num - 1] - compute(nram_gard_input, nram_dispatch, gates_value, deal_num, pingpong_num, - repeat_num - 1); - } - __sync(); - if (repeat_num > 0) { - // S[repeat_num - 1] - T *gard_input_addr = base_gard_input_addr + (repeat_num - 1) * deal_num; - store(gard_input_addr, nram_gard_input, deal_num, pingpong_num, - repeat_num - 1); - } - if (rem_num > 0) { - // C[repeat_num] - compute(nram_gard_input, nram_dispatch, gates_value, rem_num, pingpong_num, - repeat_num); - __sync(); - // S[repeat_num] - T *gard_input_addr = base_gard_input_addr + repeat_num * deal_num; - store(gard_input_addr, nram_gard_input, rem_num, pingpong_num, repeat_num); - } -} -#endif - -template -__mlu_entry__ void MLUKernelMoeDispatchBwdData1( - const T *gates, const int *indices, const int *locations, const T *dispatch, - const int samples, const int capacity, const int hidden, - const int num_experts, T *grad_input) { - // gates: (samples) - // indices: (samples) - // locations: (samples) - // dispatch: (num_experts * capacity, hidden) - // grad_input: (samples, 
hidden) -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - int one_sample_task_num = taskDim / samples; - int rem_task = taskDim % samples; - int sample_idx = 0; - if ((rem_task > 0) && (taskId < (one_sample_task_num + 1) * rem_task)) { - sample_idx = (int)(taskId / (one_sample_task_num + 1)); - one_sample_task_num = one_sample_task_num + 1; - } else { - sample_idx = (int)((taskId - rem_task) / one_sample_task_num); - } - int indices_value = indices[sample_idx]; - int location_value = locations[sample_idx]; - if (indices_value < 0 || indices_value >= num_experts || location_value < 0 || - location_value >= capacity) { - return; - } - T gates_si_value = gates[sample_idx]; - int logic_tid = taskId % one_sample_task_num; - int hidden_per_task = hidden / one_sample_task_num; - int rem_hidden_num = hidden % one_sample_task_num; - int hidden_seg_num = hidden_per_task + (int)(logic_tid < rem_hidden_num); - if (hidden_seg_num == 0) { - return; - } - int hidden_data_offset = - logic_tid * hidden_per_task + - ((logic_tid < rem_hidden_num) ? 
logic_tid : rem_hidden_num); - // | nram space partion | data num | - // | ------------------------ | -------- | - // | nram_grad_input ping | deal_h | - // | nram_dispatch ping | deal_h | - // | nram_grad_input pong | deal_h | - // | nram_dispatch pong | deal_h | - const int max_nram_num = MAX_NRAM_SIZE / sizeof(T); - const int deal_h = max_nram_num / 4; - const int pingpong_num = 2 * deal_h; - T *nram_grad_input = (T *)nram_buffer; - T *nram_dispatch = nram_grad_input + deal_h; - int grad_input_addr_offset = sample_idx * hidden + hidden_data_offset; - T *base_grad_input_addr = (T *)grad_input + grad_input_addr_offset; - int dispatch_idx_offset = - (indices_value * capacity + location_value) * hidden; - T *base_dispatch_addr = - (T *)dispatch + dispatch_idx_offset + hidden_data_offset; - int repeat_h = hidden_seg_num / deal_h; - int rem_h = hidden_seg_num % deal_h; - lcs(base_grad_input_addr, base_dispatch_addr, nram_grad_input, nram_dispatch, - gates_si_value, repeat_h, rem_h, deal_h, pingpong_num); -#endif -} - -template -__mlu_entry__ void MLUKernelMoeDispatchBwdData2( - const T *gates, const int *indices, const int *locations, const T *dispatch, - const int samples, const int capacity, const int hidden, - const int num_experts, T *grad_input) { - // gates: (samples) - // indices: (samples) - // locations: (samples) - // dispatch: (num_experts * capacity, hidden) - // grad_input: (samples, hidden) -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - int per_task_sample_num = samples / taskDim; - int rem_sample_num = samples % taskDim; - int samples_num = per_task_sample_num + (int)((taskId < rem_sample_num)); - int sample_idx = taskId * per_task_sample_num + - ((taskId < rem_sample_num) ? 
taskId : rem_sample_num); - int max_deal_h = - (MAX_NRAM_SIZE - 4 * sizeof(int) - 1 * sizeof(T)) / 2 / sizeof(T); - int deal_h = 0; - int deal_s = 0; - if (hidden > max_deal_h) { - deal_s = 1; - deal_h = max_deal_h; - } else { - deal_h = hidden; - deal_s = (MAX_NRAM_SIZE - 2 * deal_h * sizeof(T)) / - (1 * sizeof(T) + 4 * sizeof(int)); - } - // | nram space partion | data num | - // | ------------------------ | -------- | - // | nram_gates | deal_s | - // | nram_dispatch_idx_offset | deal_s | - // | nram_mask | deal_s | - // | nram_indices | deal_s | - // | nram_locations | deal_s | - // | nram_grad_input | deal_h | - // | nram_dispatch | deal_h | - T *nram_gates = (T *)nram_buffer; - int *nram_dispatch_idx_offset = (int *)(nram_gates + deal_s); - int *nram_mask = nram_dispatch_idx_offset + deal_s; - int *nram_indices = nram_mask + deal_s; - int *nram_locations = nram_indices + deal_s; - T *nram_grad_input = (T *)(nram_locations + deal_s); - T *nram_dispatch = nram_grad_input + deal_h; - int repeat_s = samples_num / deal_s; - int rem_s = samples_num % deal_s; - int repeat_h = hidden / deal_h; - int rem_h = hidden % deal_h; - // get gdram input gates indices locations offset - T *base_gates = (T *)gates + sample_idx; - int *base_indices = (int *)indices + sample_idx; - int *base_locations = (int *)locations + sample_idx; - // get gdram output grad_input offset - int grad_input_offset = sample_idx * hidden; - T *base_grad_input = (T *)grad_input + grad_input_offset; - for (int s_iter = 0; s_iter <= repeat_s; ++s_iter) { - int deal_s_num = (s_iter == repeat_s) ? 
rem_s : deal_s; - if (deal_s_num == 0) { - break; - } - // load gates indices locations - T *base_gates_s = base_gates + s_iter * deal_s; - int *base_indices_s = base_indices + s_iter * deal_s; - int *base_locations_s = base_locations + s_iter * deal_s; - __memcpy(nram_gates, base_gates_s, deal_s_num * sizeof(T), GDRAM2NRAM); - __memcpy(nram_indices, base_indices_s, deal_s_num * sizeof(int), - GDRAM2NRAM); - __memcpy(nram_locations, base_locations_s, deal_s_num * sizeof(int), - GDRAM2NRAM); - // dispatch idx = (nram_indices * capacity + nram_locations) * hidden - __bang_mul_scalar(nram_dispatch_idx_offset, nram_indices, capacity, - deal_s_num); - __bang_add(nram_dispatch_idx_offset, nram_dispatch_idx_offset, - nram_locations, deal_s_num); - __bang_mul_scalar(nram_dispatch_idx_offset, nram_dispatch_idx_offset, - hidden, deal_s_num); - // 0 <= nram_locations < capacity - __bang_ge_scalar(nram_mask, nram_locations, (int)0, deal_s_num); - __bang_lt_scalar(nram_locations, nram_locations, capacity, deal_s_num); - __bang_and(nram_locations, nram_locations, nram_mask, deal_s_num); - // 0 <= nram_indices < num_experts - __bang_ge_scalar(nram_mask, nram_indices, (int)0, deal_s_num); - __bang_lt_scalar(nram_indices, nram_indices, num_experts, deal_s_num); - __bang_and(nram_indices, nram_indices, nram_mask, deal_s_num); - __bang_and(nram_mask, nram_indices, nram_locations, deal_s_num); - // get output grad_input s offset - T *base_grad_input_s = base_grad_input + s_iter * deal_s * hidden; - for (int si = 0; si < deal_s_num; ++si) { - if (nram_mask[si] != 1) { - continue; - } - T *base_dispatch_si = (T *)dispatch + nram_dispatch_idx_offset[si]; - T *base_grad_input_s_si = base_grad_input_s + si * hidden; - for (int h_iter = 0; h_iter <= repeat_h; ++h_iter) { - int deal_h_num = (h_iter == repeat_h) ? 
rem_h : deal_h; - if (deal_h_num == 0) { - break; - } - // get input dispatch h offset - T *base_dispatch_si_h = base_dispatch_si + h_iter * deal_h; - // get output grad_input s h offset - T *base_grad_input_s_si_h = base_grad_input_s_si + h_iter * deal_h; - __memcpy(nram_dispatch, base_dispatch_si_h, deal_h_num * sizeof(T), - GDRAM2NRAM); - __bang_mul_scalar(nram_grad_input, nram_dispatch, nram_gates[si], - deal_h_num); - // store grad_input - __memcpy(base_grad_input_s_si_h, nram_grad_input, - deal_h_num * sizeof(T), NRAM2GDRAM); - } // repeat h - } // repeat deal_s_num - } // repeat s -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_input) { - /* Only float data type is supported in host-side CPP file - fool-proof processing.*/ - KERNEL_CHECK(MLUKernelMoeDispatchBwdData1<<>>( - (float *)gates, (int *)indices, (int *)locations, (float *)dispatch, - samples, capacity, hidden, num_experts, (float *)grad_input)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_input) { - /* Only float data type is supported in host-side CPP file - fool-proof processing.*/ - KERNEL_CHECK(MLUKernelMoeDispatchBwdData2<<>>( - (float *)gates, (int *)indices, (int *)locations, (float *)dispatch, - samples, capacity, hidden, num_experts, (float *)grad_input)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp 
b/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp deleted file mode 100644 index cbcd6c21f..000000000 --- a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp +++ /dev/null @@ -1,260 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "moe_dispatch_backward_gate.h" - -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static void policyFunc(const mluOpHandle_t handle, const int samples, - cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - int max_core_num = mluop::runtime::getCoreNumOfJobLimitCapability(handle); - k_dim->x = max_core_num; - k_dim->y = 1; - k_dim->z = 1; - if (samples > max_core_num) { - *k_type = CNRT_FUNC_TYPE_UNION1; - } else { - *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle); - } -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetMoeDispatchBackwardGateWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - size_t *workspace_size) { - PARAM_CHECK("[mluOpMoeDispatchBackwardGate]", handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << "[mluOpMoeDispatchBackwardGate] Only mlu300 and above " - "devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK("[mluOpMoeDispatchBackwardGate]", input_desc != NULL); - PARAM_CHECK("[mluOpMoeDispatchBackwardGate]", workspace_size != NULL); - - int samples = input_desc->dims[0]; - *workspace_size = 0; - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, samples, &k_dim, &k_type); - int taskNum = k_dim.x * k_dim.y * k_dim.z; - if ((samples > 0) && (samples < taskNum)) { - *workspace_size = taskNum * mluop::getSizeOfDataType(input_desc->dtype); - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t moeDispatchBackwardGateParamCheck( - const std::string &op_name, const mluOpHandle_t handle, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - const mluOpTensorDescriptor_t locations_desc, const void *locations, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t dispatch_desc, const void *dispatch, - const int samples, const int capacity, const int hidden, - const int num_experts, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_gates_desc, const void *grad_gates, - bool *zero_element) { - // check descriptor and data - PARAM_CHECK(op_name, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << op_name << "Only mlu300 and above devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - PARAM_CHECK(op_name, indices_desc != NULL); - PARAM_CHECK(op_name, locations_desc != NULL); - PARAM_CHECK(op_name, input_desc != NULL); - PARAM_CHECK(op_name, dispatch_desc != NULL); - PARAM_CHECK(op_name, grad_gates_desc != NULL); - - // check shape - PARAM_CHECK(op_name, indices_desc->dim == 1); - PARAM_CHECK(op_name, locations_desc->dim == 1); - PARAM_CHECK(op_name, input_desc->dim == 2); - PARAM_CHECK(op_name, dispatch_desc->dim == 2); - PARAM_CHECK(op_name, grad_gates_desc->dim == 1); - - // check data type - PARAM_CHECK_V2(op_name, (indices_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in indices tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(indices_desc->dtype) << "."); - PARAM_CHECK_V2(op_name, (locations_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in locations tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(locations_desc->dtype) << "."); - - // check tensor datatype, support float32 - PARAM_CHECK_V2(op_name, (input_desc->dtype == MLUOP_DTYPE_FLOAT), - "Only float are supported in input tensor, but the " - "data type of tensor is " - << mluOpGetNameOfDataType(input_desc->dtype) << "."); - PARAM_CHECK(op_name, input_desc->dtype == dispatch_desc->dtype); - PARAM_CHECK(op_name, input_desc->dtype == grad_gates_desc->dtype); - - // check dim - PARAM_CHECK(op_name, samples >= 0); - PARAM_CHECK(op_name, capacity >= 0); - PARAM_CHECK(op_name, hidden >= 0); - PARAM_CHECK(op_name, num_experts >= 0); - PARAM_CHECK(op_name, (samples == indices_desc->dims[0])); - PARAM_CHECK(op_name, (samples == locations_desc->dims[0])); - PARAM_CHECK(op_name, (samples == input_desc->dims[0])); - PARAM_CHECK(op_name, (samples == grad_gates_desc->dims[0])); - PARAM_CHECK(op_name, ((num_experts * capacity) == dispatch_desc->dims[0])); - PARAM_CHECK(op_name, (hidden == input_desc->dims[1])); - 
PARAM_CHECK(op_name, (hidden == dispatch_desc->dims[1])); - - const size_t indices_element_num = mluOpGetTensorElementNum(indices_desc); - const size_t locations_element_num = mluOpGetTensorElementNum(locations_desc); - const size_t input_element_num = mluOpGetTensorElementNum(input_desc); - const size_t dispatch_element_num = mluOpGetTensorElementNum(dispatch_desc); - const size_t grad_gates_element_num = - mluOpGetTensorElementNum(grad_gates_desc); - - // check large tensor - TENSOR_NUM_CHECK(op_name, indices_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, locations_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, input_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, dispatch_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, grad_gates_element_num, LARGE_TENSOR_NUM, ""); - - // check element num zero - if (indices_element_num == 0 || locations_element_num == 0 || - input_element_num == 0 || dispatch_element_num == 0 || - grad_gates_element_num == 0) { - *zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - - // check workspace ptr - if (workspace_size > 0) { - PARAM_CHECK(op_name, workspace != NULL); - } - - // input and output ptr check null - PARAM_CHECK(op_name, indices != NULL); - PARAM_CHECK(op_name, locations != NULL); - PARAM_CHECK(op_name, input != NULL); - PARAM_CHECK(op_name, dispatch != NULL); - PARAM_CHECK(op_name, grad_gates != NULL); - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMoeDispatchBackwardGate( - mluOpHandle_t handle, const mluOpTensorDescriptor_t indices_desc, - const void *indices, const mluOpTensorDescriptor_t locations_desc, - const void *locations, const mluOpTensorDescriptor_t input_desc, - const void *input, const mluOpTensorDescriptor_t dispatch_desc, - const void *dispatch, const int samples, const int capacity, - const int hidden, const int num_experts, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t 
grad_gates_desc, - void *grad_gates) { - // check params - bool zero_element = false; - mluOpStatus_t param_check = moeDispatchBackwardGateParamCheck( - "[mluOpMoeDispatchBackwardGate]", handle, indices_desc, indices, - locations_desc, locations, input_desc, input, dispatch_desc, dispatch, - samples, capacity, hidden, num_experts, workspace, workspace_size, - grad_gates_desc, grad_gates, &zero_element); - if (param_check != MLUOP_STATUS_SUCCESS) { - return param_check; - } - - // check zero element - if (zero_element == true) { - VLOG(5) << "[mluOpMoeDispatchBackwardGate] Skip zero element tensor."; - if (samples > 0) { - VLOG(5) << "cnnlFill_v3 start."; - const size_t fill_value = 0x0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_gates_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_gates)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - VLOG(5) << "cnnlFill_v3 end."; - } - return MLUOP_STATUS_SUCCESS; - } - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("moe_dispatch_backward_gate"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(true, "locations", locations, locations_desc); - GEN_CASE_DATA(true, "input", input, input_desc, 0, 0); - GEN_CASE_DATA(true, "dispatch", dispatch, dispatch_desc, 0, 0); - GEN_CASE_DATA(false, "grad_gates", grad_gates, grad_gates_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "moe_dispatch_backward_gate", "samples", - samples); - GEN_CASE_OP_PARAM_SINGLE(1, "moe_dispatch_backward_gate", "capacity", - capacity); - GEN_CASE_OP_PARAM_SINGLE(2, "moe_dispatch_backward_gate", "hidden", hidden); - GEN_CASE_OP_PARAM_SINGLE(3, "moe_dispatch_backward_gate", "num_experts", - num_experts); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - 
policyFunc(handle, samples, &k_dim, &k_type); - VLOG(5) << "Launch Kernel mluOpMoeDispatchBackwardGate<<>>"; - mluOpDataType_t data_type = input_desc->dtype; - uint32_t taskNum = k_dim.x * k_dim.y * k_dim.z; - if (samples <= taskNum) { - VLOG(5) << "[mluOpMoeDispatchBackwardGate] launch " - "KernelMoeDispatchBwdGate1"; - CHECK_RETURN("[mluOpMoeDispatchBackwardGate1]", - KernelMoeDispatchBwdGate1(k_dim, k_type, handle->queue, - data_type, indices, locations, input, - dispatch, samples, capacity, hidden, - num_experts, workspace, grad_gates)); - } else { - VLOG(5) << "[mluOpMoeDispatchBackwardGate] launch " - "KernelMoeDispatchBwdGate2"; - CHECK_RETURN( - "[mluOpMoeDispatchBackwardGate2]", - KernelMoeDispatchBwdGate2(k_dim, k_type, handle->queue, data_type, - indices, locations, input, dispatch, samples, - capacity, hidden, num_experts, grad_gates)); - } - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h b/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h deleted file mode 100644 index b3429884d..000000000 --- a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h +++ /dev/null @@ -1,42 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MOE_DISPATCH_BACKWARD_GATE_MOE_DISPATCH_BACKWARD_GATE_H -#define KERNELS_MOE_DISPATCH_BACKWARD_GATE_MOE_DISPATCH_BACKWARD_GATE_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *indices, const void *locations, - const void *input, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *workspace, void *grad_gates); - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *indices, const void *locations, - const void *input, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_gates); - -#endif // KERNELS_MOE_DISPATCH_BACKWARD_GATE_MOE_DISPATCH_BACKWARD_GATE_H diff --git a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu b/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu deleted file mode 100644 index 383c97d0a..000000000 --- a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu +++ /dev/null @@ -1,387 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subh_iterect to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "moe_dispatch_backward_gate.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#if __BANG_ARCH__ >= 372 -template -static __mlu_func__ void load(const T *input_addr, const T *dispatch_addr, - T *nram_input, T *nram_dispatch, - const int deal_num, const int pingpong_num, - const int pi) { - int offset = (pi % 2) * pingpong_num; - T *nram_input_p = nram_input + offset; - T *nram_dispatch_p = nram_dispatch + offset; - __memcpy_async(nram_input_p, input_addr, deal_num * sizeof(T), GDRAM2NRAM); - __memcpy_async(nram_dispatch_p, dispatch_addr, deal_num * sizeof(T), - GDRAM2NRAM); -} - -template -static __mlu_func__ void compute(T *nram_input, T *nram_dispatch, T *gard_gates, - const int deal_num, const int pingpong_num, - const int pi) { - int offset = (pi % 2) * pingpong_num; - T *nram_input_p = nram_input + offset; - T *nram_dispatch_p = nram_dispatch + offset; - __bang_mul(nram_input_p, nram_input_p, nram_dispatch_p, deal_num); - if (deal_num > 1) { - __bang_sumpool(nram_input_p, nram_input_p, 1, 1, deal_num, 1, deal_num, 1, - 1); - } - *gard_gates += nram_input_p[0]; -} - -template -static __mlu_func__ void lcs(T *base_input_addr, T *base_dispatch_addr, - T *nram_input, T *nram_dispatch, T *gard_gates, - const int repeat_num, const int rem_num, - const int deal_num, const int pingpong_num) { - if (repeat_num > 0) { - // L - T *input_addr = base_input_addr; - T *dispatch_addr = base_dispatch_addr; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_num, - pingpong_num, 0); - __sync(); - } - - if (repeat_num > 1) { - // L - T *input_addr = base_input_addr + deal_num; - T *dispatch_addr = base_dispatch_addr + deal_num; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_num, - pingpong_num, 1); - - // C - compute(nram_input, 
nram_dispatch, gard_gates, deal_num, pingpong_num, 0); - __sync(); - } - - for (int n_iter = 0; n_iter < repeat_num - 2; n_iter++) { - // L - T *input_addr = base_input_addr + (n_iter + 2) * deal_num; - T *dispatch_addr = base_dispatch_addr + (n_iter + 2) * deal_num; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_num, - pingpong_num, n_iter + 2); - - // C - compute(nram_input, nram_dispatch, gard_gates, deal_num, pingpong_num, - n_iter + 1); - __sync(); - } - - if (rem_num > 0) { - // L - T *input_addr = base_input_addr + repeat_num * deal_num; - T *dispatch_addr = base_dispatch_addr + repeat_num * deal_num; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, rem_num, - pingpong_num, repeat_num); - } - if (repeat_num > 0) { - // C - compute(nram_input, nram_dispatch, gard_gates, deal_num, pingpong_num, - repeat_num - 1); - } - __sync(); - - if (rem_num > 0) { - // C - compute(nram_input, nram_dispatch, gard_gates, rem_num, pingpong_num, - repeat_num); - __sync(); - } -} -#endif - -template -__mlu_global__ void MLUKernelMoeDispatchBwdGate1( - const int *indices, const int *locations, const T *input, const T *dispatch, - const int samples, const int capacity, const int hidden, - const int num_experts, T *workspace, T *grad_gates) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - - int one_sample_task_num = taskDim / samples; - int rem_task = taskDim % samples; - int sample_idx = 0; - if ((rem_task > 0) && (taskId < (one_sample_task_num + 1) * rem_task)) { - sample_idx = (int)(taskId / (one_sample_task_num + 1)); - one_sample_task_num = one_sample_task_num + 1; - } else { - sample_idx = (int)((taskId - rem_task) / one_sample_task_num); - } - - int indice = indices[sample_idx]; - int location = locations[sample_idx]; - T gard_gates_temp = (T)0.0; - - if (location >= 0 && location < capacity && indice >= 0 && - indice < num_experts) { - int logic_tid = taskId % one_sample_task_num; - int hidden_per_task = hidden / 
one_sample_task_num; - int rem_hidden_num = hidden % one_sample_task_num; - int hidden_seg_num = hidden_per_task + (int)(logic_tid < rem_hidden_num); - int hidden_data_offset = - logic_tid * hidden_per_task + - ((logic_tid < rem_hidden_num) ? logic_tid : rem_hidden_num); - - if (hidden_seg_num > 0) { - // nram space - // ping/pong: |nram_input|nram_dispatch| - int max_nram_num = MAX_NRAM_SIZE / sizeof(T); - int deal_h = max_nram_num / 4; - int pingpong_num = 2 * deal_h; - - T *nram_input = (T *)nram_buffer; - T *nram_dispatch = nram_input + deal_h; - - int input_addr_offset = sample_idx * hidden + hidden_data_offset; - T *base_input_addr = (T *)input + input_addr_offset; - int idx = (indice * capacity + location) * hidden; - T *base_dispatch_addr = (T *)dispatch + idx + hidden_data_offset; - - int repeat_h = hidden_seg_num / deal_h; - int rem_h = hidden_seg_num % deal_h; - lcs(base_input_addr, base_dispatch_addr, nram_input, nram_dispatch, - &gard_gates_temp, repeat_h, rem_h, deal_h, pingpong_num); - } - } - - if (samples == taskDim) { - grad_gates[sample_idx] = gard_gates_temp; - return; - } else { - workspace[taskId] = gard_gates_temp; - } - __sync_all_ipu(); - - if ((samples < taskDim) && (taskId == 0)) { - T *nram_grad_gates = (T *)nram_buffer; - __bang_write_zero(nram_grad_gates, samples); - - if (samples > 1) { - int one_sample_task_num = taskDim / samples; - int rem_task = taskDim % samples; - int sample_idx = 0; - for (int ti = 0; ti < taskDim; ti++) { - if ((rem_task > 0) && (ti < (one_sample_task_num + 1) * rem_task)) { - sample_idx = (int)(ti / (one_sample_task_num + 1)); - } else { - sample_idx = (int)((ti - rem_task) / one_sample_task_num); - } - nram_grad_gates[sample_idx] += workspace[ti]; - } - } else { - __memcpy(nram_grad_gates, workspace, taskDim * sizeof(T), GDRAM2NRAM); - __bang_sumpool(nram_grad_gates, nram_grad_gates, 1, 1, taskDim, 1, - taskDim, 1, 1); - } - // store - __memcpy(grad_gates, nram_grad_gates, samples * sizeof(T), NRAM2GDRAM); - 
} -#endif -} - -template -__mlu_global__ void MLUKernelMoeDispatchBwdGate2( - const int *indices, const int *locations, const T *input, const T *dispatch, - const int samples, const int capacity, const int hidden, - const int num_experts, T *grad_gates) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - int per_task_sample_num = samples / taskDim; - int rem_sample_num = samples % taskDim; - int samples_num = per_task_sample_num + (int)((taskId < rem_sample_num)); - int sample_idx = taskId * per_task_sample_num + - ((taskId < rem_sample_num) ? taskId : rem_sample_num); - // nram space - // |nram_indices|nram_location|nram_idx|nram_mask| - // ping/pong:|nram_input|nram_dispatch| - int max_deal_h = (MAX_NRAM_SIZE - 4 * sizeof(int)) / (4 * sizeof(T)); - int pingpong_num = 0; - int deal_h = 0; - int deal_s = 0; - if (hidden > max_deal_h) { - deal_s = 1; - deal_h = max_deal_h; - } else { - deal_h = hidden; - deal_s = (MAX_NRAM_SIZE - 4 * deal_h * sizeof(T)) / (4 * sizeof(int)); - } - - int *nram_indices = (int *)nram_buffer; - int *nram_location = nram_indices + deal_s; - int *nram_idx = nram_location + deal_s; - int *nram_mask = nram_idx + deal_s; - // ping/pong - pingpong_num = 2 * deal_h; - T *nram_input = (T *)(nram_mask + deal_s); - T *nram_dispatch = nram_input + deal_h; - - int repeat_s = samples_num / deal_s; - int rem_s = samples_num % deal_s; - int repeat_h = hidden / deal_h; - int rem_h = hidden % deal_h; - - int *base_indices = (int *)indices + sample_idx; - int *base_locations = (int *)locations + sample_idx; - int input_addr_offset = sample_idx * hidden; - T *base_input = (T *)input + input_addr_offset; - T *base_grad_gates = (T *)grad_gates + sample_idx; - - for (int s_iter = 0; s_iter < repeat_s + 1; s_iter++) { - int deal_s_num = (s_iter < repeat_s) ? 
deal_s : rem_s; - if (deal_s_num == 0) { - break; - } - - T *base_input_addr = base_input + s_iter * deal_s * hidden; - int *indices_addr = base_indices + s_iter * deal_s; - int *locations_addr = base_locations + s_iter * deal_s; - __memcpy(nram_indices, indices_addr, deal_s_num * sizeof(int), GDRAM2NRAM); - __memcpy(nram_location, locations_addr, deal_s_num * sizeof(int), - GDRAM2NRAM); - - // idx = (nram_indices * capacity + nram_location) * hidden - __bang_mul_scalar(nram_idx, nram_indices, capacity, deal_s_num); - __bang_add(nram_idx, nram_idx, nram_location, deal_s_num); - __bang_mul_scalar(nram_idx, nram_idx, hidden, deal_s_num); - - // 0 <= nram_location < capacity - __bang_ge_scalar(nram_mask, nram_location, (int)0, deal_s_num); - __bang_lt_scalar(nram_location, nram_location, capacity, deal_s_num); - __bang_and(nram_mask, nram_mask, nram_location, deal_s_num); - - // 0 <= nram_indices < num_experts - __bang_ge_scalar(nram_location, nram_indices, (int)0, deal_s_num); - __bang_lt_scalar(nram_indices, nram_indices, num_experts, deal_s_num); - __bang_and(nram_mask, nram_mask, nram_location, deal_s_num); - __bang_and(nram_mask, nram_mask, nram_indices, deal_s_num); - - T *nram_grad_gates = (T *)nram_indices; - __bang_write_zero(nram_grad_gates, deal_s_num); - - if (deal_s_num > 1) { - T *base_dispatch_addr = (T *)dispatch; - - // L(si=0) - if (nram_mask[0] == 1) { - T *input_addr = base_input_addr; - T *dispatch_addr = base_dispatch_addr + nram_idx[0]; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_h, - pingpong_num, 0); - __sync(); - } - - // L(si=1) - if (nram_mask[1] == 1) { - T *input_addr = base_input_addr + hidden; - T *dispatch_addr = base_dispatch_addr + nram_idx[1]; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_h, - pingpong_num, 1); - } - - // C(si=0) - if (nram_mask[0] == 1) { - compute(nram_input, nram_dispatch, nram_grad_gates, deal_h, - pingpong_num, 0); - } - __sync(); - - for (int si = 0; si < deal_s_num - 
2; si++) { - // L(si+2) - if (nram_mask[si + 2] == 1) { - T *input_addr = base_input_addr + (si + 2) * hidden; - T *dispatch_addr = base_dispatch_addr + nram_idx[si + 2]; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_h, - pingpong_num, si + 2); - } - - // C(si+1) - if (nram_mask[si + 1] == 1) { - compute(nram_input, nram_dispatch, nram_grad_gates + (si + 1), deal_h, - pingpong_num, si + 1); - } - __sync(); - } - - // C(si=deal_s_num - 1) - if (nram_mask[deal_s_num - 1] == 1) { - compute(nram_input, nram_dispatch, nram_grad_gates + (deal_s_num - 1), - deal_h, pingpong_num, deal_s_num - 1); - __sync(); - } - } else { - // si = sample_idx + s_iter - if (nram_mask[0] == 1) { - T *base_dispatch_addr = (T *)dispatch + nram_idx[0]; - lcs(base_input_addr, base_dispatch_addr, nram_input, nram_dispatch, - nram_grad_gates, repeat_h, rem_h, deal_h, pingpong_num); - } - } - // store: - __memcpy(base_grad_gates + s_iter * deal_s, nram_grad_gates, - deal_s_num * sizeof(T), NRAM2GDRAM); - } -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *indices, const void *locations, - const void *input, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *workspace, void *grad_gates) { - /* Only float data type is supported in host-side CPP file - fool-proof processing.*/ - KERNEL_CHECK(MLUKernelMoeDispatchBwdGate1<<>>( - (int *)indices, (int *)locations, (float *)input, (float *)dispatch, - samples, capacity, hidden, num_experts, (float *)workspace, - (float *)grad_gates)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *indices, const void *locations, - const void *input, const void *dispatch, const int samples, - const int capacity, const int 
hidden, const int num_experts, - void *grad_gates) { - /* Only float data type is supported in host-side CPP file - fool-proof processing.*/ - KERNEL_CHECK(MLUKernelMoeDispatchBwdGate2<<>>( - (int *)indices, (int *)locations, (float *)input, (float *)dispatch, - samples, capacity, hidden, num_experts, (float *)grad_gates)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_forward/moe_dispatch_forward.cpp b/kernels/moe_dispatch_forward/moe_dispatch_forward.cpp deleted file mode 100644 index 1ea54e088..000000000 --- a/kernels/moe_dispatch_forward/moe_dispatch_forward.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "moe_dispatch_forward.h" - -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" - -static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - // block policy func - *k_type = CNRT_FUNC_TYPE_BLOCK; - // dimx equals to num of mlu cores in each cluster - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - // dimy equals to num of current available clusters - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -static mluOpStatus_t MoeDispatchForwardParamCheck( - const std::string &op_name, const mluOpHandle_t handle, - const mluOpTensorDescriptor_t gates_desc, const void *gates, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - const mluOpTensorDescriptor_t locations_desc, const void *locations, - const mluOpTensorDescriptor_t input_desc, const void *input, - const int samples, const int capacity, const int hidden, - const int num_experts, const mluOpTensorDescriptor_t dispatch_desc, - void *dispatch, bool *zero_element) { - // check descriptor and data - PARAM_CHECK(op_name, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << op_name << "Only mlu300 and above devices are supported." 
- << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - PARAM_CHECK(op_name, gates_desc != NULL); - PARAM_CHECK(op_name, indices_desc != NULL); - PARAM_CHECK(op_name, locations_desc != NULL); - PARAM_CHECK(op_name, input_desc != NULL); - PARAM_CHECK(op_name, dispatch_desc != NULL); - - // check shape - PARAM_CHECK(op_name, gates_desc->dim == 1); - PARAM_CHECK(op_name, indices_desc->dim == 1); - PARAM_CHECK(op_name, locations_desc->dim == 1); - PARAM_CHECK(op_name, input_desc->dim == 2); - PARAM_CHECK(op_name, dispatch_desc->dim == 2); - - // check data type - PARAM_CHECK_V2(op_name, (indices_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in indices tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(indices_desc->dtype) << "."); - PARAM_CHECK_V2(op_name, (locations_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in locations tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(locations_desc->dtype) << "."); - - // check tensor datatype, support float32 - PARAM_CHECK_V2(op_name, input_desc->dtype == MLUOP_DTYPE_FLOAT, - "Only float are supported in input tensor, but the " - "data type of tensor is " - << mluOpGetNameOfDataType(input_desc->dtype) << "."); - PARAM_CHECK(op_name, input_desc->dtype == dispatch_desc->dtype); - PARAM_CHECK(op_name, input_desc->dtype == gates_desc->dtype); - - // check dim - PARAM_CHECK(op_name, samples >= 0); - PARAM_CHECK(op_name, capacity >= 0); - PARAM_CHECK(op_name, hidden >= 0); - PARAM_CHECK(op_name, num_experts >= 0); - PARAM_CHECK(op_name, (samples == gates_desc->dims[0])); - PARAM_CHECK(op_name, (samples == indices_desc->dims[0])); - PARAM_CHECK(op_name, (samples == locations_desc->dims[0])); - PARAM_CHECK(op_name, (samples == input_desc->dims[0])); - PARAM_CHECK(op_name, ((num_experts * capacity) == dispatch_desc->dims[0])); - PARAM_CHECK(op_name, (hidden == input_desc->dims[1])); - PARAM_CHECK(op_name, (hidden == 
dispatch_desc->dims[1])); - - // check correlation of parameters - PARAM_CHECK_V2(op_name, samples <= (num_experts * capacity), - "The samples must be less than or equal to the " - "multiplication result of the capacity and num_experts"); - - const size_t indices_element_num = mluOpGetTensorElementNum(indices_desc); - const size_t locations_element_num = mluOpGetTensorElementNum(locations_desc); - const size_t input_element_num = mluOpGetTensorElementNum(input_desc); - const size_t dispatch_element_num = mluOpGetTensorElementNum(dispatch_desc); - const size_t gates_element_num = mluOpGetTensorElementNum(gates_desc); - - // check large tensor - TENSOR_NUM_CHECK(op_name, indices_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, locations_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, input_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, dispatch_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, gates_element_num, LARGE_TENSOR_NUM, ""); - - // check element num zero - if (indices_element_num == 0 || locations_element_num == 0 || - input_element_num == 0 || dispatch_element_num == 0 || - gates_element_num == 0) { - *zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - - // input and output ptr check null - PARAM_CHECK(op_name, indices != NULL); - PARAM_CHECK(op_name, locations != NULL); - PARAM_CHECK(op_name, input != NULL); - PARAM_CHECK(op_name, dispatch != NULL); - PARAM_CHECK(op_name, gates != NULL); - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMoeDispatchForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t gates_desc, - const void *gates, const mluOpTensorDescriptor_t indices_desc, - const void *indices, const mluOpTensorDescriptor_t locations_desc, - const void *locations, const mluOpTensorDescriptor_t input_desc, - const void *input, const int samples, const int capacity, const int hidden, - const int num_experts, const mluOpTensorDescriptor_t dispatch_desc, - 
void *dispatch) { - // check params - bool zero_element = false; - mluOpStatus_t param_check = MoeDispatchForwardParamCheck( - "[mluOpMoeDispatchForward]", handle, gates_desc, gates, indices_desc, - indices, locations_desc, locations, input_desc, input, samples, capacity, - hidden, num_experts, dispatch_desc, dispatch, &zero_element); - if (param_check != MLUOP_STATUS_SUCCESS) { - return param_check; - } - - // check zero element - if (zero_element == true) { - VLOG(5) << "[mluOpMoeDispatchForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("moe_dispatch_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "gates", gates, gates_desc, 0, 1); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(true, "locations", locations, locations_desc); - GEN_CASE_DATA(true, "input", input, input_desc, -100, 100); - GEN_CASE_DATA(true, "dispatch", dispatch, dispatch_desc, -100, 100); - GEN_CASE_OP_PARAM_SINGLE(0, "moe_dispatch_forward", "samples", samples); - GEN_CASE_OP_PARAM_SINGLE(1, "moe_dispatch_forward", "capacity", capacity); - GEN_CASE_OP_PARAM_SINGLE(2, "moe_dispatch_forward", "hidden", hidden); - GEN_CASE_OP_PARAM_SINGLE(3, "moe_dispatch_forward", "num_experts", - num_experts); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, &k_dim, &k_type); - VLOG(5) << "Launch kernel mluOpMoeDispatchForward<<>>"; - - mluOpDataType_t data_type = input_desc->dtype; - VLOG(5) << "[mluOpMoeDispatchForward] launch " - "KernelMoeDispatchForward"; - CHECK_RETURN( - "[mluOpMoeDispatchForward]", - KernelMoeDispatchForward(k_dim, k_type, handle->queue, data_type, gates, - indices, locations, input, samples, capacity, - hidden, num_experts, dispatch)); - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_forward/moe_dispatch_forward.h 
b/kernels/moe_dispatch_forward/moe_dispatch_forward.h deleted file mode 100644 index b8bba1971..000000000 --- a/kernels/moe_dispatch_forward/moe_dispatch_forward.h +++ /dev/null @@ -1,35 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MOE_DISPATCH_FORWARD_MOE_DISPATCH_FORWARD_H -#define KERNELS_MOE_DISPATCH_FORWARD_MOE_DISPATCH_FORWARD_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *input, const int samples, - const int capacity, const int hidden, const int num_experts, - void *dispatch); - -#endif // KERNELS_MOE_DISPATCH_FORWARD_MOE_DISPATCH_FORWARD_H diff --git a/kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu b/kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu deleted file mode 100644 index 8e3cb502b..000000000 --- a/kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu +++ /dev/null @@ -1,155 +0,0 @@ - -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subh_iterect to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "moe_dispatch_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_global__ void MLUKernelMoeDispatchFwd( - const T *gates, const int *indices, const int *locations, const T *input, - const int samples, const int capacity, const int hidden, - const int num_experts, T *dispatch) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - - int max_deal_h = (MAX_NRAM_SIZE - 4 * sizeof(int) - sizeof(T)) / (sizeof(T)); - int hidden_per_task = hidden / taskDim; - int hidden_rem = hidden % taskDim; - hidden_per_task += (taskId < hidden_rem) ? 1 : 0; - int deal_h = 0; - int deal_s = 0; - if (hidden_per_task > max_deal_h) { - deal_h = max_deal_h; - deal_s = 1; - } else { - deal_h = hidden_per_task; - deal_s = - (MAX_NRAM_SIZE - deal_h * sizeof(T)) / (4 * sizeof(int) + sizeof(T)); - deal_s = deal_s < samples ? 
deal_s : samples; - } - - // | nram space partion | data num | - // | ------------------------ | -------- | - // | nram_input | deal_h | - // | nram_gates | deal_s | - // | nram_indices | deal_s | - // | nram_location | deal_s | - // | nram_idx | deal_s | - // | nram_mask | deal_s | - - T *nram_input = (T *)nram_buffer; - T *nram_gates = nram_input + deal_h; - int *nram_indices = (int *)nram_gates + deal_s; - int *nram_locations = nram_indices + deal_s; - int *nram_idx = nram_locations + deal_s; - int *nram_mask = nram_idx + deal_s; - - int repeat_s = samples / deal_s; - int rem_s = samples % deal_s; - int repeat_h = hidden_per_task / deal_h; - int rem_h = hidden_per_task % deal_h; - - for (int s_iter = 0; s_iter <= repeat_s; ++s_iter) { - int deal_s_num = (s_iter == repeat_s) ? rem_s : deal_s; - if (deal_s_num == 0) { - break; - } - - // load gates indices locations - T *base_gates = (T *)gates + s_iter * deal_s_num; - int *base_indices = (int *)indices + s_iter * deal_s_num; - int *base_locations = (int *)locations + s_iter * deal_s_num; - - __memcpy(nram_gates, base_gates, deal_s_num * sizeof(T), GDRAM2NRAM); - __memcpy(nram_indices, base_indices, deal_s_num * sizeof(int), GDRAM2NRAM); - __memcpy(nram_locations, base_locations, deal_s_num * sizeof(int), - GDRAM2NRAM); - - // compute dispatch idx = (nram_indices * capacity + nram_locations) - __bang_mul_scalar(nram_idx, nram_indices, capacity, deal_s_num); - __bang_add(nram_idx, nram_idx, nram_locations, deal_s_num); - - // 0 <= nram_locations < capacity - __bang_ge_scalar(nram_mask, nram_locations, (int)0, deal_s_num); - __bang_lt_scalar(nram_locations, nram_locations, capacity, deal_s_num); - __bang_and(nram_locations, nram_locations, nram_mask, deal_s_num); - - // 0 <= nram_indices < num_experts - __bang_ge_scalar(nram_mask, nram_indices, (int)0, deal_s_num); - __bang_lt_scalar(nram_indices, nram_indices, num_experts, deal_s_num); - __bang_and(nram_indices, nram_indices, nram_mask, deal_s_num); - 
__bang_and(nram_mask, nram_indices, nram_locations, deal_s_num); - - T *base_input = (T *)input + s_iter * deal_s_num * hidden; - for (int ds_iter = 0; ds_iter < deal_s_num; ++ds_iter) { - if (nram_mask[ds_iter] == 1) { - T *base_input_s = base_input + ds_iter * hidden; - T *base_dispatch_s = dispatch + nram_idx[ds_iter] * hidden; - - for (int h_iter = 0; h_iter <= repeat_h; ++h_iter) { - int deal_h_num = (h_iter == repeat_h) ? rem_h : deal_h; - if (deal_h_num == 0) { - break; - } - int input_rem_num = (taskId < hidden_rem ? taskId : hidden_rem); - int input_offset = (hidden / taskDim) * taskId + input_rem_num; - T *base_input_h = base_input_s + input_offset + h_iter * deal_h; - T *base_dispatch_h = base_dispatch_s + input_offset + h_iter * deal_h; - __memcpy(nram_input, base_input_h, deal_h_num * sizeof(T), - GDRAM2NRAM); - - // dispatch = input * gates - __bang_mul_scalar(nram_input, nram_input, nram_gates[ds_iter], - deal_h_num); - - // store dispatch to GDRAM - __memcpy(base_dispatch_h, nram_input, deal_h_num * sizeof(T), - NRAM2GDRAM); - } // repeat h - } - } // deal s - } // repeat s -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *input, const int samples, - const int capacity, const int hidden, const int num_experts, - void *dispatch) { - /* Only float data type is supported in host-side CPP file - fool-proof processing.*/ - KERNEL_CHECK(MLUKernelMoeDispatchFwd<<>>( - (float *)gates, (int *)indices, (int *)locations, (float *)input, samples, - capacity, hidden, num_experts, (float *)dispatch)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp b/kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp deleted file mode 100644 index 0bb07a68a..000000000 --- 
a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp +++ /dev/null @@ -1,446 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "ms_deform_attn_backward.h" - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/tool.h" -#include "core/type.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -char API[] = "[mluOpMsDeformAttnBackward]"; - -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - -/*! - * @brief Describes the kernel policy of ms_deform_attn_backward. - */ -typedef enum { - MLUOP_MS_DEFORM_ATTN_BACKWARD_DEFAULT = 0, - /*!< Returns the default policy. 
*/ - MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL = 1, - /*!< Returns the small channel policy. */ - MLUOP_MS_DEFORM_ATTN_BACKWARD_FAST = 2, - /*!< Returns the fast policy. */ -} mluOpDeformAttnBackwardKernelPolicy_t; - -static void policyFunc(mluOpHandle_t handle, const int32_t batch, - const int32_t num_query, const int32_t num_heads, - const int32_t num_levels, cnrtFunctionType_t *k_type, - cnrtDim3_t *k_dim, - mluOpDeformAttnBackwardKernelPolicy_t kernelPolicy) { - size_t cluster_limit = mluop::runtime::getClusterLimitCapability(handle); - size_t core_limit = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dim->x = core_limit; - int32_t total_num = batch * num_query * num_heads * num_levels; - if (kernelPolicy == MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL) { - total_num = batch * num_query; - } - size_t total_num_align = CEIL_ALIGN(total_num, core_limit); - k_dim->y = (total_num_align / core_limit) > cluster_limit - ? cluster_limit - : (total_num_align / core_limit); - k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; -} - -mluOpDeformAttnBackwardKernelPolicy_t msDeformAttnBackwardPolicyFunc( - const mluOpHandle_t handle, const int channels, const int num_levels, - const int num_points, const int num_heads) { - const int num_hlp = num_heads * num_levels * num_points; - int num_per_time_theory = (MAX_NRAM_SIZE - num_levels * sizeof(float) - - 3 * num_levels * sizeof(int32_t)) / - sizeof(float) / (8 * PAD_UP(channels, 32) + 28) / - PAD_UP((num_hlp), 32); - int32_t nlp = num_levels * num_points; - int32_t nlpc = num_levels * num_points * channels; - - if ((handle->arch == MLUOP_MLU590) && (nlp <= FAST_KERNEL_MAX_NLP) && - (nlpc <= FAST_KERNEL_MAX_NLPC)) { - return MLUOP_MS_DEFORM_ATTN_BACKWARD_FAST; - } else if (num_per_time_theory >= 1) { - return MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL; - } - return MLUOP_MS_DEFORM_ATTN_BACKWARD_DEFAULT; -} - -/* check user entrance param in mluOpMsDeformAttnBackward */ -static mluOpStatus_t 
msDeformAttnBackwardParamCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t value_desc, - const void *value, const mluOpTensorDescriptor_t spatial_shapes_desc, - const void *spatial_shapes, - const mluOpTensorDescriptor_t level_start_index_desc, - const void *level_start_index, - const mluOpTensorDescriptor_t sampling_loc_desc, const void *sampling_loc, - const mluOpTensorDescriptor_t attn_weight_desc, const void *attn_weight, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - const int32_t im2col_step, const mluOpTensorDescriptor_t grad_value_desc, - void *grad_value, const mluOpTensorDescriptor_t grad_sampling_loc_desc, - void *grad_sampling_loc, - const mluOpTensorDescriptor_t grad_attn_weight_desc, void *grad_attn_weight, - bool *calc_grad_loc_weight_flag, bool *calc_grad_value_flag, - bool *calc_grad_value_loc_weight_flag) { - // check desc - PARAM_CHECK(API, handle != NULL); - PARAM_CHECK(API, value_desc != NULL); - PARAM_CHECK(API, spatial_shapes_desc != NULL); - PARAM_CHECK(API, level_start_index_desc != NULL); - PARAM_CHECK(API, sampling_loc_desc != NULL); - PARAM_CHECK(API, attn_weight_desc != NULL); - PARAM_CHECK(API, grad_output_desc != NULL); - PARAM_CHECK(API, grad_value_desc != NULL); - PARAM_CHECK(API, grad_sampling_loc_desc != NULL); - PARAM_CHECK(API, grad_attn_weight_desc != NULL); - - // check dim - PARAM_CHECK(API, value_desc->dim == 4); - PARAM_CHECK(API, spatial_shapes_desc->dim == 2); - PARAM_CHECK(API, level_start_index_desc->dim == 1); - PARAM_CHECK(API, sampling_loc_desc->dim == 6); - PARAM_CHECK(API, attn_weight_desc->dim == 5); - PARAM_CHECK(API, grad_output_desc->dim == 4); - PARAM_CHECK(API, grad_value_desc->dim == 4); - PARAM_CHECK(API, grad_sampling_loc_desc->dim == 6); - PARAM_CHECK(API, grad_attn_weight_desc->dim == 5); - - // check datatype - PARAM_CHECK(API, (value_desc->dtype == MLUOP_DTYPE_FLOAT && - spatial_shapes_desc->dtype == MLUOP_DTYPE_INT32 && - level_start_index_desc->dtype == 
MLUOP_DTYPE_INT32 && - sampling_loc_desc->dtype == MLUOP_DTYPE_FLOAT && - attn_weight_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_output_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_value_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_sampling_loc_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_attn_weight_desc->dtype == MLUOP_DTYPE_FLOAT)); - - const int32_t num_key = value_desc->dims[1]; - const int32_t channels = value_desc->dims[3]; - const int32_t batch = attn_weight_desc->dims[0]; - const int32_t num_query = attn_weight_desc->dims[1]; - const int32_t num_heads = attn_weight_desc->dims[2]; - const int32_t num_levels = attn_weight_desc->dims[3]; - const int32_t num_points = attn_weight_desc->dims[4]; - // check input param - const int32_t im2col_step_ = MIN(batch, im2col_step); - PARAM_CHECK(API, im2col_step_ > 0); - PARAM_CHECK(API, batch % im2col_step_ == 0); - - // check all the input relationship - for (int32_t i = 0; i < value_desc->dim; ++i) { - if (value_desc->dims[i] != grad_value_desc->dims[i]) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The shape of value should be " - "the same as grad_value." - << " But now value_desc->dims[" << i << "] is " - << value_desc->dims[i] << ", and grad_value_desc->dims[" << i - << "] is " << grad_value_desc->dims[i] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - for (int32_t i = 0; i < sampling_loc_desc->dim; ++i) { - if (sampling_loc_desc->dims[i] != grad_sampling_loc_desc->dims[i]) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The shape of " - "sampling_loc_desc should be the " - "same as grad_sampling_loc_desc." 
- << " But now sampling_loc_desc->dims[" << i << "] is " - << sampling_loc_desc->dims[i] - << ", and grad_sampling_loc_desc->dims[" << i << "] is " - << grad_sampling_loc_desc->dims[i] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - for (int32_t i = 0; i < attn_weight_desc->dim; ++i) { - if (attn_weight_desc->dims[i] != grad_attn_weight_desc->dims[i]) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The shape of " - "attn_weight_desc should be the " - "same as grad_attn_weight_desc." - << " But now attn_weight_desc->dims[" << i << "] is " - << attn_weight_desc->dims[i] - << ", and grad_attn_weight_desc->dims[" << i << "] is " - << grad_attn_weight_desc->dims[i] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - PARAM_CHECK_EQ(API, value_desc->dims[0], batch); - PARAM_CHECK_EQ(API, value_desc->dims[2], num_heads); - - PARAM_CHECK_EQ(API, spatial_shapes_desc->dims[0], num_levels); - PARAM_CHECK_EQ(API, spatial_shapes_desc->dims[1], 2); - - PARAM_CHECK_EQ(API, level_start_index_desc->dims[0], num_levels); - - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[0], batch); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[1], num_query); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[2], num_heads); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[3], num_levels); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[4], num_points); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[5], 2); - - PARAM_CHECK_EQ(API, grad_output_desc->dims[0], batch); - PARAM_CHECK_EQ(API, grad_output_desc->dims[1], num_query); - PARAM_CHECK_EQ(API, grad_output_desc->dims[2], num_heads); - PARAM_CHECK_EQ(API, grad_output_desc->dims[3], channels); - - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(value_desc), LARGE_TENSOR_NUM, - ""); - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(sampling_loc_desc), - LARGE_TENSOR_NUM, ""); - - // check zero - if (batch * channels * num_heads * num_query == 0) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The batch, channels, num_key, " - "num_heads or " - "num_query of the 
input is zero."; - return MLUOP_STATUS_BAD_PARAM; - } - if ((num_levels == 0) || ((num_points == 0) && num_key == 0)) { - *calc_grad_value_loc_weight_flag = true; - return MLUOP_STATUS_SUCCESS; - } - if ((num_points == 0) && (num_key != 0)) { - *calc_grad_loc_weight_flag = true; - return MLUOP_STATUS_SUCCESS; - } - if ((num_key == 0) && (num_points != 0)) { - *calc_grad_value_flag = true; - return MLUOP_STATUS_SUCCESS; - } - - PARAM_CHECK(API, value != NULL); - PARAM_CHECK(API, spatial_shapes != NULL); - PARAM_CHECK(API, level_start_index != NULL); - PARAM_CHECK(API, sampling_loc != NULL); - PARAM_CHECK(API, attn_weight != NULL); - PARAM_CHECK(API, grad_output != NULL); - PARAM_CHECK(API, grad_value != NULL); - PARAM_CHECK(API, grad_sampling_loc != NULL); - PARAM_CHECK(API, grad_attn_weight != NULL); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnBackward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t value_desc, - const void *value, const mluOpTensorDescriptor_t spatial_shapes_desc, - const void *spatial_shapes, - const mluOpTensorDescriptor_t level_start_index_desc, - const void *level_start_index, - const mluOpTensorDescriptor_t sampling_loc_desc, const void *sampling_loc, - const mluOpTensorDescriptor_t attn_weight_desc, const void *attn_weight, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - const int32_t im2col_step, const mluOpTensorDescriptor_t grad_value_desc, - void *grad_value, const mluOpTensorDescriptor_t grad_sampling_loc_desc, - void *grad_sampling_loc, - const mluOpTensorDescriptor_t grad_attn_weight_desc, - void *grad_attn_weight) { - // entrance param check - bool calc_grad_value_flag = false; - bool calc_grad_loc_weight_flag = false; - bool calc_grad_value_loc_weight_flag = false; - mluOpStatus_t param_check_status = msDeformAttnBackwardParamCheck( - handle, value_desc, value, spatial_shapes_desc, spatial_shapes, - level_start_index_desc, level_start_index, sampling_loc_desc, 
- sampling_loc, attn_weight_desc, attn_weight, grad_output_desc, - grad_output, im2col_step, grad_value_desc, grad_value, - grad_sampling_loc_desc, grad_sampling_loc, grad_attn_weight_desc, - grad_attn_weight, &calc_grad_loc_weight_flag, &calc_grad_value_flag, - &calc_grad_value_loc_weight_flag); - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("ms_deform_attn_backward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "value", value, value_desc); - GEN_CASE_DATA_REAL(true, "spatial_shapes", spatial_shapes, - spatial_shapes_desc); - GEN_CASE_DATA_REAL(true, "level_start_index", level_start_index, - level_start_index_desc); - GEN_CASE_DATA_REAL(true, "sampling_loc", sampling_loc, sampling_loc_desc); - GEN_CASE_DATA_REAL(true, "attn_weight", attn_weight, attn_weight_desc); - GEN_CASE_DATA_REAL(true, "grad_output", grad_output, grad_output_desc); - GEN_CASE_DATA(false, "grad_value", grad_value, grad_value_desc, 0, 0); - GEN_CASE_DATA(false, "grad_sampling_loc", grad_sampling_loc, - grad_sampling_loc_desc, 0, 0); - GEN_CASE_DATA(false, "grad_attn_weight", grad_attn_weight, - grad_attn_weight_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "ms_deform_attn_backward", "im2col_step", - im2col_step); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - if (MLUOP_STATUS_SUCCESS != param_check_status) { - return param_check_status; - } - - if (calc_grad_loc_weight_flag) { - uint64_t fill_value = 0x0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_value_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_value)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; - } - if (calc_grad_value_flag) { - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_sampling_loc_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_sampling_loc)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_attn_weight_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_attn_weight)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; - } - if (calc_grad_value_loc_weight_flag) { - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; - } - VLOG(5) << "[mluOpMsDeformAttnBackward] cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_value_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_value)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_sampling_loc_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_sampling_loc)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_attn_weight_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_attn_weight)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - VLOG(5) << 
"[mluOpMsDeformAttnBackward] cnnlFill_v3 end."; - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - const int32_t spatial_size = value_desc->dims[1]; - const int32_t batch = attn_weight_desc->dims[0]; - const int32_t channels = value_desc->dims[3]; - const int32_t num_query = attn_weight_desc->dims[1]; - const int32_t num_heads = attn_weight_desc->dims[2]; - const int32_t num_levels = attn_weight_desc->dims[3]; - const int32_t num_points = attn_weight_desc->dims[4]; - // generate mluOpMsDeformAttnBackward prototxt start! - - VLOG(5) << "[mluOpMsDeformAttnBackward] batch: " << batch; - VLOG(5) << "[mluOpMsDeformAttnBackward] channels: " << channels; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_query: " << num_query; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_heads: " << num_heads; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_levels: " << num_levels; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_points: " << num_points; - VLOG(5) << "[mluOpMsDeformAttnBackward] spatial_size: " << spatial_size; - - mluOpDeformAttnBackwardKernelPolicy_t kernelPolicy = - msDeformAttnBackwardPolicyFunc(handle, channels, num_levels, num_points, - num_heads); - - policyFunc(handle, batch, num_query, num_heads, num_levels, &k_type, &k_dim, - kernelPolicy); - switch (kernelPolicy) { - case MLUOP_MS_DEFORM_ATTN_BACKWARD_FAST: { - VLOG(5) << "Launch Kernel MsDeformAttnBackwardFast<<>>"; - CHECK_RETURN( - "[MsDeformAttnBackwardFast]", - KernelMsDeformAttnBackwardFast( - k_dim, k_type, handle->queue, (float *)value, - (int32_t *)spatial_shapes, (int32_t *)level_start_index, - (float *)sampling_loc, (float *)attn_weight, (float *)grad_output, - batch, spatial_size, num_heads, channels, num_levels, num_query, - num_points, (float *)grad_value, (float *)grad_sampling_loc, - (float *)grad_attn_weight)); - } break; - case MLUOP_MS_DEFORM_ATTN_BACKWARD_DEFAULT: { - VLOG(5) << "Launch Kernel MsDeformAttnBackwardDefault<<>>"; - CHECK_RETURN( - "[MsDeformAttnBackwardDefault]", - 
KernelMsDeformAttnBackwardDefault( - k_dim, k_type, handle->queue, (float *)value, - (int32_t *)spatial_shapes, (int32_t *)level_start_index, - (float *)sampling_loc, (float *)attn_weight, (float *)grad_output, - batch, spatial_size, num_heads, channels, num_levels, num_query, - num_points, (float *)grad_value, (float *)grad_sampling_loc, - (float *)grad_attn_weight)); - } break; - case MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL: { - VLOG(5) << "Launch Kernel MsDeformAttnBackwardSmallChannels<<>>"; - CHECK_RETURN( - "[MsDeformAttnBackwardSmallChannels]", - KernelMsDeformAttnBackwardSmallChannels( - k_dim, k_type, handle->queue, (float *)value, - (int32_t *)spatial_shapes, (int32_t *)level_start_index, - (float *)sampling_loc, (float *)attn_weight, (float *)grad_output, - batch, spatial_size, num_heads, channels, num_levels, num_query, - num_points, (float *)grad_value, (float *)grad_sampling_loc, - (float *)grad_attn_weight)); - } - default: { - VLOG(5) << "Not Implemented."; - } - } - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.h b/kernels/ms_deform_attn_backward/ms_deform_attn_backward.h deleted file mode 100644 index 64237ff6d..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.h +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MS_DEFORM_ATTN_BACKWARD_MS_DEFORM_ATTN_BACKWARD_H -#define KERNELS_MS_DEFORM_ATTN_BACKWARD_MS_DEFORM_ATTN_BACKWARD_H - -#include "mlu_op.h" - -#define FAST_KERNEL_MAX_NLP (128) -#define FAST_KERNEL_MAX_NLPC (16384) - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardSmallChannels( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight); - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardDefault( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight); - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float 
*grad_sampling_loc, - float *grad_attn_weight); - -#endif // KERNELS_MS_DEFORM_ATTN_BACKWARD_MS_DEFORM_ATTN_BACKWARD_H diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu deleted file mode 100644 index 7b4a97527..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ /dev/null @@ -1,626 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ - -#include "kernels/ms_deform_attn_backward/ms_deform_attn_backward.h" -#include "kernels/ms_deform_attn_forward/ms_deform_attn_utils.h" - -#include "core/logging.h" - -#if (__BANG_ARCH__ == 592) - -#define MAX_MEMCPY_SEGNUM (65536) -#define NRAM_REMAIN_SIZE (48 * 1024) -#define SRAM_REMAIN_SIZE (32 * 1024) -#define NRAM_AVALIABLE_SIZE (__MLU_NRAM_SIZE__ * 1024 - NRAM_REMAIN_SIZE) -#define WRAM_AVALIABLE_SIZE (__MLU_WRAM_SIZE__ * 1024) -#define SRAM_AVALIABLE_SIZE (__MLU_SRAM_SIZE__ * 1024 - SRAM_REMAIN_SIZE) - -__nram__ char nram_buffer[NRAM_AVALIABLE_SIZE]; -__mlu_shared__ char sram_buffer[SRAM_AVALIABLE_SIZE]; -__wram__ char wram_buffer[WRAM_AVALIABLE_SIZE]; - -__mlu_func__ void loadNram2Gpr(int32_t& v1, int32_t& v2, int32_t& v3, - int32_t& v4, const int32_t* p1, - const int32_t* p2, const int32_t* p3, - const int32_t* p4) { - v1 = __load_nram(p1); - v2 = __load_nram(p2); - v3 = __load_nram(p3); - v4 = __load_nram(p4); -} - -template -__mlu_func__ void memPolicyBackward( - int32_t*& seq_nram, T*& zeros_nram, int32_t*& data_offset_nram, - T*& weight_polation_nram, T*& cond_point_polation_nram, - T*& cond_point_valid_nram, T*& delta_xy_nram, T*& loc_nram, T*& buf_nram, - T*& buf_nram_end, int8_t*& mask_x_nram, int8_t*& mask_y_nram, - T*& spatial_offset_bd_nram, T*& spatial_w_bd_nram, T*& spatial_h_bd_nram, - int32_t*& spatial_offset_nram, int32_t*& spatial_hw_nram, - T*& compute_buffer, // (5, deal_n, num_levels, num_points, channels) - T*& weight_polation_nram_stg2, T*& weight_attn_nram_stg2, - int32_t*& offset_nram_stg2, T*& grad_output_nram, - int8_t*& bit_cond_nram, // (4, pad_points / 8) - int8_t*& bit_cond_reverse_nram, // (4, pad_points / 8) - T*& cond_nram_stg2, - T*& compute_buffer_nram_stg3, // (4, max_deal_n, num_levels, num_points) - T*& delta_xy_nram_stg3, // (4, max_deal_n, num_levels, num_points) - T*& grad_wp_nram_stg3, // (4, total_deal_n, num_levels, num_points) - 
int32_t*& data_offset_sram, T*& weight_polation_sram, T*& grad_wp_sram, - T*& weight_attn_sram, T*& cond_point_polation_sram, T*& delta_xy_sram, - char* nram_buffer, char* sram_buffer, int32_t& max_cached_n, - int32_t& stage_1_max_deal_n, int32_t& stage_2_max_deal_n, - int32_t& stage_3_max_deal_n, int32_t& mask_size, - const int32_t nram_avaliable_size, const int32_t sram_avaliable_size, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points) { - const int32_t num_points_levels = num_levels * num_points; - const int32_t spatial_info_size = - PAD_UP(3 * num_levels * sizeof(int32_t), WRAM_ALIGN_SIZE); - const int32_t spatial_info_bd_size = - PAD_UP(3 * num_points_levels * sizeof(T), WRAM_ALIGN_SIZE); - const int32_t zeros_size = PAD_UP(channels * sizeof(T), WRAM_ALIGN_SIZE); - const int32_t seq_size = BACKWARD_MAX_NQ_NL_NP * sizeof(int32_t); - const int32_t fix_space_size = spatial_info_size + - 2 * BIT_COLLECT_PAD * sizeof(T) + - spatial_info_bd_size + zeros_size + seq_size; - const int32_t left_space_size = nram_avaliable_size - fix_space_size; - stage_1_max_deal_n = left_space_size / (24 * num_points_levels * sizeof(T)); - const int32_t total_points = stage_1_max_deal_n * num_points_levels; - const int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - mask_size = PAD_UP(total_coord_pad / BIT_COLLECT_PAD, WRAM_ALIGN_SIZE); - stage_2_max_deal_n = - (left_space_size - 2 * mask_size - 7 * WRAM_ALIGN_SIZE) / - ((5 * num_points_levels * channels + 20 * num_points_levels + channels) * - sizeof(T)); - stage_2_max_deal_n = - std::min(BACKWARD_MAX_NQ_NL_NP / num_points_levels, stage_2_max_deal_n); - stage_3_max_deal_n = (left_space_size - 2 * mask_size - 2 * WRAM_ALIGN_SIZE) / - (12 * num_points_levels * sizeof(T)); - // fix nram space - seq_nram = (int32_t*)(nram_buffer); - zeros_nram = (T*)(seq_nram + seq_size / sizeof(int32_t)); 
- spatial_offset_nram = (int32_t*)(zeros_nram + zeros_size / sizeof(T)); - spatial_hw_nram = spatial_offset_nram + num_levels; - spatial_offset_bd_nram = - (T*)((int8_t*)spatial_offset_nram + spatial_info_size); - spatial_w_bd_nram = spatial_offset_bd_nram + num_points_levels; - spatial_h_bd_nram = spatial_w_bd_nram + num_points_levels; - mask_x_nram = (int8_t*)spatial_offset_bd_nram + spatial_info_bd_size; - mask_y_nram = mask_x_nram + mask_size; - // stage1 nram space - // 4 + 4 + 4 + 4 + 1 + 6 - data_offset_nram = (int32_t*)(mask_y_nram + mask_size); - delta_xy_nram = (T*)(data_offset_nram + 4 * total_points); - weight_polation_nram = delta_xy_nram + 4 * total_points; - cond_point_polation_nram = weight_polation_nram + 4 * total_points; - cond_point_valid_nram = cond_point_polation_nram + 4 * total_points; - buf_nram = cond_point_valid_nram + total_points; - loc_nram = buf_nram + 4 * total_points; - buf_nram_end = buf_nram + 6 * total_points + total_coord_pad; - // stage2 nram space - const int32_t total_points_stg2 = stage_2_max_deal_n * num_points_levels; - const int32_t compute_buffer_size_pad = - 5 * PAD_UP(total_points_stg2 * channels * sizeof(T), WRAM_ALIGN_SIZE); - const int32_t bit_cond_pad_size = - PAD_UP(PAD_UP(total_points_stg2, BIT_COLLECT_PAD) / BIT_COLLECT_PAD * 5, - WRAM_ALIGN_SIZE); - cond_nram_stg2 = (T*)(mask_y_nram + mask_size); - bit_cond_nram = (int8_t*)cond_nram_stg2 + - PAD_UP(5 * total_points_stg2 * sizeof(T), WRAM_ALIGN_SIZE); - bit_cond_reverse_nram = bit_cond_nram + bit_cond_pad_size; - compute_buffer = (T*)(bit_cond_reverse_nram + bit_cond_pad_size); - grad_output_nram = compute_buffer + compute_buffer_size_pad / sizeof(T); - weight_polation_nram_stg2 = grad_output_nram + stage_2_max_deal_n * channels; - weight_attn_nram_stg2 = weight_polation_nram_stg2 + 4 * total_points_stg2; - offset_nram_stg2 = (int32_t*)(weight_attn_nram_stg2 + total_points_stg2); - // stage3 nram space - const int32_t total_points_stg3 = stage_3_max_deal_n * 
num_points_levels; - compute_buffer_nram_stg3 = (T*)(mask_y_nram + mask_size); - delta_xy_nram_stg3 = compute_buffer_nram_stg3 + 4 * total_points_stg3; - grad_wp_nram_stg3 = delta_xy_nram_stg3 + 4 * total_points_stg3; - // sram space: 4 + 4 + 1 + 5 + 4 - const int32_t polation_info_size = 18 * num_points_levels * sizeof(T); - const int32_t avg_sram_size = sram_avaliable_size / coreDim; - max_cached_n = avg_sram_size / polation_info_size; - const int32_t max_cached_points = max_cached_n * num_points_levels; - T* sram_buf_base = (T*)(sram_buffer + avg_sram_size * coreId); - data_offset_sram = (int32_t*)sram_buf_base; - weight_polation_sram = (T*)(data_offset_sram + 4 * max_cached_points); - weight_attn_sram = (T*)(weight_polation_sram + 4 * max_cached_points); - cond_point_polation_sram = (T*)(weight_attn_sram + max_cached_points); - delta_xy_sram = (T*)(cond_point_polation_sram + 5 * max_cached_points); - grad_wp_sram = weight_polation_sram; // reuse -} - -template -__mlu_func__ void backwardStageTwoLoop( - int32_t* seq_nram, T* compute_buffer_nram, T* zeros_nram, - T* weight_polation_nram, T* weight_attn_nram, int32_t* offset_nram, - T* cond_nram, int8_t* bit_cond_nram, int8_t* bit_cond_reverse_nram, - T* grad_output_nram, T* delta_xy_nram, int32_t* data_offset_sram, - T* weight_polation_sram, T* grad_wp_sram, T* weight_attn_sram, - T* cond_point_polation_sram, T* delta_xy_sram, T* data_value_gdram, - T* grad_output_gdram, T* grad_value_gdram, T* grad_attn_weight_gdram, - char* wram_buffer, const int32_t total_deal_n, const int32_t max_deal_n, - const int32_t input_stride_2, const int32_t input_stride_3, - const int32_t output_stride_2, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, - const int32_t num_points) { - const int32_t num_levels_points = num_levels * num_points; - const int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - int32_t* offset_zero_nram_stg2 = - offset_nram + 4 * max_deal_n * num_levels_points; - 
const int32_t src_stride = total_deal_n * num_levels_points * sizeof(T); - for (int i = 0; i < loop_num; i++) { - int32_t deal_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t copy_size_1 = deal_n * num_levels_points * sizeof(T); - int32_t copy_size_2 = deal_n * num_levels_points * sizeof(int32_t); - int32_t sram_src_offset = i * max_deal_n * num_levels_points; - int32_t nq_nl_np_c = deal_n * num_levels_points * channels; - int32_t nq_nl_np = deal_n * num_levels_points; - int32_t nq_nl_np_4 = 4 * deal_n * num_levels_points; - - __memcpy_async(grad_output_nram, - grad_output_gdram + i * max_deal_n * output_stride_2, - channels * sizeof(T), GDRAM2NRAM, channels * sizeof(T), - output_stride_2 * sizeof(T), deal_n - 1); - __memcpy_async(offset_nram, data_offset_sram + sram_src_offset, copy_size_2, - SRAM2NRAM, copy_size_2, src_stride, 3); - __memcpy_async(cond_nram, cond_point_polation_sram + sram_src_offset, - copy_size_1, SRAM2NRAM, copy_size_1, src_stride, 4); - __memcpy_async(weight_attn_nram, weight_attn_sram + sram_src_offset, - copy_size_1, SRAM2NRAM); - __memcpy_async(weight_polation_nram, weight_polation_sram + sram_src_offset, - copy_size_1, SRAM2NRAM, copy_size_1, src_stride, 3); - __bang_write_value(offset_zero_nram_stg2, nq_nl_np_4, (int32_t)0); - __sync_move(); - - T* tmp_zero = (T*)offset_zero_nram_stg2; - int32_t nq_nl_np_pad8 = PAD_UP(nq_nl_np, BIT_COLLECT_PAD); - int32_t bit_cond_stride = nq_nl_np_pad8 / BIT_COLLECT_PAD; - if (nq_nl_np_pad8 == nq_nl_np) { - int32_t bit_cond_stride_4 = 4 * bit_cond_stride; - __bang_gt_bitindex((T*)bit_cond_nram, cond_nram, tmp_zero, nq_nl_np_4); - __bang_bnot((char*)bit_cond_reverse_nram, (char*)bit_cond_nram, - 4 * bit_cond_stride); - __bang_gt_bitindex((T*)(bit_cond_nram + bit_cond_stride_4), - cond_nram + nq_nl_np_4, tmp_zero, nq_nl_np); - __bang_bnot((char*)(bit_cond_reverse_nram + bit_cond_stride_4), - (char*)(bit_cond_nram + bit_cond_stride_4), bit_cond_stride); - } else { - for (int j = 0; j < 
5; j++) { - __bang_gt_bitindex((T*)((int8_t*)bit_cond_nram + j * bit_cond_stride), - cond_nram + j * nq_nl_np, tmp_zero, nq_nl_np_pad8); - __bang_bnot((char*)bit_cond_reverse_nram + j * bit_cond_stride, - (char*)bit_cond_nram + j * bit_cond_stride, - bit_cond_stride); - } - } - - __sync_io_move_compute(); - - int32_t buffer_size_pad = PAD_UP(nq_nl_np_c * sizeof(T), WRAM_ALIGN_SIZE); - int32_t buffer_data_num = buffer_size_pad / sizeof(T); - T* inter_grad = compute_buffer_nram; - T* v_ping = inter_grad + buffer_data_num; - T* v_pong = v_ping + buffer_data_num; - T* value_wp = v_pong + buffer_data_num; - T* buffer = value_wp + buffer_data_num; - - for (int j = 0; j < 5; j++) { - T* tmp_wp = weight_polation_nram + (j - 1) * nq_nl_np; - if (j < 4) { - gatherAsync(v_ping, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + j * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); - gatherAsync(v_ping, data_value_gdram, - (unsigned int*)offset_nram + j * nq_nl_np, - bit_cond_nram + j * bit_cond_stride, channels * sizeof(T), - GDRAM2NRAM, channels * sizeof(T), nq_nl_np); - } - - if (j == 0) { - // (n, c) => (n, nl, np, c) - __memcpy_async(buffer, grad_output_nram, channels * sizeof(T), - NRAM2NRAM, channels * sizeof(T), num_levels_points - 1, - num_levels_points * channels * sizeof(T), deal_n - 1, 0, - num_levels_points - 1, channels * sizeof(T), deal_n - 1); - gatherAsync(buffer, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + 4 * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); - __bang_write_value(value_wp, nq_nl_np_c, (T)0); // clear value*wp - __sync_move(); - // (n, nl, np, c) => (c, n, nl, np) - __bang_transpose(v_pong, buffer, nq_nl_np, channels); - __sync_compute(); - // (c, n, nl, np) * (n, nl, np) - __bang_cycle_mul(inter_grad, v_pong, weight_attn_nram, nq_nl_np_c, - nq_nl_np); - __memcpy_async(wram_buffer, v_pong, buffer_size_pad, 
NRAM2WRAM); - } - - if (j == 4) { - __memcpy_async(v_ping, wram_buffer, buffer_size_pad, WRAM2NRAM); - } - - if (j > 0) { - // (n, nl, np, c) => (c, n, nl, np) - __bang_transpose(buffer, v_pong, nq_nl_np, channels); - // (c, n, nl, np) * (n, nl, np) - __bang_cycle_mul(v_pong, buffer, tmp_wp, nq_nl_np_c, nq_nl_np); - __bang_add(value_wp, value_wp, v_pong, nq_nl_np_c); - __bang_mul(v_pong, buffer, inter_grad, nq_nl_np_c); - // (c, nq, nl, np) => (nq, nl, np) - __bang_sumpool(buffer, v_pong, nq_nl_np, channels, 1, channels, 1, 1, - 1); - __bang_float2int32((int32_t*)v_pong, cond_nram + (j - 1) * nq_nl_np, - nq_nl_np, 0); - __bang_mul_scalar((int32_t*)v_pong, (int32_t*)v_pong, - (int32_t)0xffffffff, nq_nl_np); - __bang_band((char*)buffer, (char*)buffer, (char*)v_pong, - nq_nl_np * sizeof(T)); - // (nq, nl, np) => (Nq, nl, np) - __sync_compute(); - __memcpy_async(grad_wp_sram + sram_src_offset + - (j - 1) * total_deal_n * num_levels_points, - buffer, nq_nl_np * sizeof(T), NRAM2SRAM); - } - - T* tmp = v_ping; - v_ping = v_pong; - v_pong = tmp; - - __sync_io_move_compute(); - } - - // compute grad_attn_weight - T* grad_output_bd = v_pong; - __bang_mul(v_ping, value_wp, grad_output_bd, nq_nl_np_c); - // (c, nq, nl, np) => (nq, nl, np) - __bang_sumpool(buffer, v_ping, nq_nl_np, channels, 1, channels, 1, 1, 1); - __memcpy(grad_attn_weight_gdram + i * max_deal_n * input_stride_3, buffer, - num_levels_points * sizeof(T), NRAM2GDRAM, - input_stride_3 * sizeof(T), num_levels_points * sizeof(T), - deal_n - 1); - - // compute grad_value - T* grad_value_buffer = inter_grad + buffer_data_num; - int32_t neighbor_order[4] = {1, 3, 0, 2}; - for (int k = 0; k < 4; k++) { - int neighbor_idx = neighbor_order[k]; - T* grad_value_tmp = k < 3 ? 
buffer : inter_grad; - __bang_cycle_mul(grad_value_tmp, inter_grad, - weight_polation_nram + neighbor_idx * nq_nl_np, - nq_nl_np_c, nq_nl_np); - __bang_transpose(grad_value_buffer + k * buffer_data_num, grad_value_tmp, - channels, nq_nl_np); - } - - // store all valid point - T* cond_all_valid = cond_nram + nq_nl_np_4; - __bang_and(cond_all_valid, cond_nram, cond_nram + nq_nl_np, nq_nl_np); - __bang_and(cond_all_valid, cond_all_valid, cond_nram + 2 * nq_nl_np, - nq_nl_np); - __bang_and(cond_all_valid, cond_all_valid, cond_nram + 3 * nq_nl_np, - nq_nl_np); - int32_t all_valid_count = __bang_sum(cond_all_valid, nq_nl_np); - int32_t* dst_offset = (int32_t*)offset_zero_nram_stg2; - for (int i = 0; i < 4; i++) { - __bang_collect((T*)dst_offset + i * nq_nl_np, - (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); - } - int32_t* src_offset = (int32_t*)inter_grad; - int32_t* stride_4_2 = dst_offset + 3 * nq_nl_np; - int32_t* stride_1_2 = dst_offset; - __bang_collect((T*)src_offset, (T*)seq_nram, cond_all_valid, nq_nl_np); - __bang_mul_scalar(src_offset, src_offset, channels * sizeof(T), nq_nl_np); - __bang_sub(stride_4_2, stride_4_2, dst_offset + nq_nl_np, nq_nl_np); - __bang_sub(stride_1_2, stride_1_2, dst_offset + nq_nl_np, nq_nl_np); - int src_stride_1 = buffer_size_pad; - int src_stride_2 = src_stride_1 * 2; - int32_t* dst_offset_base = dst_offset + nq_nl_np; - int32_t dst_offset_2, src_offset_2, dst_stride_4_2, dst_stride_1_2; - for (int s = 0; s < all_valid_count; s++) { - loadNram2Gpr(dst_offset_2, src_offset_2, dst_stride_4_2, dst_stride_1_2, - dst_offset_base + s, src_offset + s, stride_4_2 + s, - stride_1_2 + s); - __bang_atomic_reduce_add((T*)((int8_t*)grad_value_gdram + dst_offset_2), - (T*)((int8_t*)grad_value_buffer + src_offset_2), - channels, 1, 1, dst_stride_4_2, dst_stride_1_2, - src_stride_1, src_stride_2); - } - - // store partial valid point - __bang_not(cond_all_valid, cond_all_valid, nq_nl_np); - __bang_cycle_and(cond_nram, cond_nram, 
cond_all_valid, nq_nl_np_4, - nq_nl_np); - for (int k = 0; k < 4; k++) { - int32_t offset = neighbor_order[k] * nq_nl_np; - T* grad_value_tmp = grad_value_buffer + k * buffer_data_num; - T* tmp_cond = cond_nram + offset; - int32_t* tmp_dst_offset = offset_nram + offset; - int32_t* tmp_src_offset = (int32_t*)inter_grad; - int32_t valid_count = __bang_sum(tmp_cond, nq_nl_np); - if (valid_count > 0) { - __bang_collect((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond, - nq_nl_np); - __bang_collect((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np); - __bang_mul_scalar(tmp_src_offset, tmp_src_offset, channels * sizeof(T), - valid_count); - for (int p = 0; p < valid_count; p++) { - __bang_atomic_reduce_add( - (T*)((int8_t*)grad_value_gdram + tmp_dst_offset[p]), - (T*)((int8_t*)grad_value_tmp + tmp_src_offset[p]), channels); - } - } - } - __sync_io_move_compute(); - } -} - -template -__mlu_func__ void backwardStageThreeLoop( - T* compute_buffer_nram, T* delta_xy_nram, T* grad_wp_nram, - T* spatial_h_bd_nram, T* spatial_w_bd_nram, T* delta_xy_sram, - T* grad_wp_sram, T* grad_loc_gdram, const int32_t total_deal_n, - const int32_t max_deal_n, const int32_t input_stride_2, - const int32_t input_stride_3, const int32_t output_stride_2, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_points) { - const int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - const int32_t num_levels_points = num_levels * num_points; - const int32_t src_stride = total_deal_n * num_levels_points * sizeof(T); - /* - grad_dx = (grad_w3-grad_w1)*dy + (grad_w4-grad_w2)*(1-dy) - grad_loc_x = grad_dx * W - grad_dy = (grad_w3-grad_w4)*dx + (grad_w1-grad_w2)*(1-dx) - grad_loc_y = grad_dy * H - */ - for (int i = 0; i < loop_num; i++) { - int32_t deal_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t copy_size = deal_n * num_levels_points * sizeof(T); - int32_t sram_src_offset = i * max_deal_n * num_levels_points; - int32_t 
nq_nl_np = deal_n * num_levels_points; - T* grad_wp_1 = grad_wp_nram; - T* grad_wp_2 = grad_wp_nram + nq_nl_np; - T* grad_wp_3 = grad_wp_nram + 2 * nq_nl_np; - T* grad_wp_4 = grad_wp_nram + 3 * nq_nl_np; - T* dx = delta_xy_nram; - T* dx_1 = delta_xy_nram + nq_nl_np; - T* dy = delta_xy_nram + 2 * nq_nl_np; - T* dy_1 = delta_xy_nram + 3 * nq_nl_np; - T* buf_1 = compute_buffer_nram; - T* buf_2 = compute_buffer_nram + nq_nl_np; - T* buf_3 = compute_buffer_nram + 2 * nq_nl_np; - __memcpy(delta_xy_nram, delta_xy_sram + sram_src_offset, copy_size, - SRAM2NRAM, copy_size, src_stride, 3); - __memcpy(grad_wp_nram, grad_wp_sram + sram_src_offset, copy_size, SRAM2NRAM, - copy_size, src_stride, 3); - __bang_fusion(FUSION_FSM, buf_1, grad_wp_3, grad_wp_1, dy, nq_nl_np, - nq_nl_np); - __bang_fusion(FUSION_FSM, buf_2, grad_wp_4, grad_wp_2, dy_1, nq_nl_np, - nq_nl_np); - __bang_add(buf_1, buf_1, buf_2, nq_nl_np); - __bang_cycle_mul(buf_1, buf_1, spatial_w_bd_nram, nq_nl_np, - num_levels_points); - __bang_fusion(FUSION_FSM, buf_2, grad_wp_3, grad_wp_4, dx, nq_nl_np, - nq_nl_np); - __bang_fusion(FUSION_FSM, buf_3, grad_wp_1, grad_wp_2, dx_1, nq_nl_np, - nq_nl_np); - __bang_add(buf_2, buf_2, buf_3, nq_nl_np); - __bang_cycle_mul(buf_2, buf_2, spatial_h_bd_nram, nq_nl_np, - num_levels_points); - __bang_transpose(buf_3, buf_1, 2, - nq_nl_np); // (2, nq_nl_np) -> (nq_nl_np, 2) - __memcpy(grad_loc_gdram + i * max_deal_n * input_stride_3 * 2, buf_3, - input_stride_2 * 2 * sizeof(T), NRAM2GDRAM, - input_stride_3 * 2 * sizeof(T), input_stride_2 * 2 * sizeof(T), - deal_n - 1); - } -} - -#endif - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardFastKernel( - const float* data_value, const int32_t* spatial_shapes, - const int32_t* data_level_start_index, const float* data_sampling_loc, - const float* data_attn_weight, const float* grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t 
num_query, - const int32_t num_points, float* grad_value, float* grad_sampling_loc, - float* grad_attn_weight) { -#if (__BANG_ARCH__ == 592) - using T = float; - const int32_t num_keys = spatial_size; - const int32_t input_stride_4 = - num_query * num_heads * num_levels * num_points; - const int32_t input_stride_3 = num_heads * num_levels * num_points; - const int32_t input_stride_2 = num_levels * num_points; - const int32_t output_stride_3 = num_query * num_heads * channels; - const int32_t output_stride_2 = num_heads * channels; - const int32_t data_value_stride_3 = num_keys * num_heads * channels; - - int32_t* seq_nram = nullptr; // (1024) - T* zeros_nram = nullptr; // (channels) - int32_t* data_offset_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_valid_nram = nullptr; // (deal_n, num_levels, num_points) - T* loc_nram = nullptr; // (deal_n, num_levels, num_points, 2) - T* buf_nram = nullptr; // (6, deal_n, num_levels, num_points) - T* buf_nram_end = nullptr; - int8_t* mask_x_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - int8_t* mask_y_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - T* spatial_offset_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_w_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_h_bd_nram = nullptr; // (num_levels, num_points) - int32_t* spatial_offset_nram = nullptr; // (num_levels) - int32_t* spatial_hw_nram = nullptr; // (num_levels, 2) - T* compute_buffer_nram_stg2 = - nullptr; // (deal_n, num_levels, num_points, channels) - T* weight_polation_nram_stg2 = - nullptr; // (4, deal_n, num_levels, num_points) - T* delta_xy_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_attn_nram_stg2 = nullptr; // (1, deal_n, num_levels, num_points) - int32_t* offset_nram_stg2 = nullptr; // (4, deal_n, num_levels, 
num_points) - T* grad_output_nram = nullptr; // (deal_n, channels) - T* cond_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* compute_buffer_nram_stg3 = - nullptr; // (4, max_deal_n, num_levels, num_points) - T* delta_xy_nram_stg3 = nullptr; // (4, max_deal_n, num_levels, num_points) - T* grad_wp_nram_stg3 = nullptr; - T* value_sram = nullptr; // (num_keys, channels) - int32_t* data_offset_sram = nullptr; - T* weight_polation_sram = nullptr; - T* grad_wp_sram = nullptr; - T* weight_attn_sram = nullptr; - T* cond_point_polation_sram = nullptr; - T* delta_xy_sram = nullptr; - int8_t* bit_cond_nram = nullptr; // (4, pad_points / 8) - int8_t* bit_cond_reverse_nram = nullptr; // (4, pad_points / 8) - int32_t stage_1_max_deal_n = 0; - int32_t stage_2_max_deal_n = 0; - int32_t stage_3_max_deal_n = 0; - int32_t max_cached_n = 0; - int32_t mask_size = 0; - memPolicyBackward( - seq_nram, zeros_nram, data_offset_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, delta_xy_nram, loc_nram, - buf_nram, buf_nram_end, mask_x_nram, mask_y_nram, spatial_offset_bd_nram, - spatial_w_bd_nram, spatial_h_bd_nram, spatial_offset_nram, - spatial_hw_nram, compute_buffer_nram_stg2, weight_polation_nram_stg2, - weight_attn_nram_stg2, offset_nram_stg2, grad_output_nram, bit_cond_nram, - bit_cond_reverse_nram, cond_nram_stg2, compute_buffer_nram_stg3, - delta_xy_nram_stg3, grad_wp_nram_stg3, data_offset_sram, - weight_polation_sram, grad_wp_sram, weight_attn_sram, - cond_point_polation_sram, delta_xy_sram, nram_buffer, sram_buffer, - max_cached_n, stage_1_max_deal_n, stage_2_max_deal_n, stage_3_max_deal_n, - mask_size, NRAM_AVALIABLE_SIZE, SRAM_AVALIABLE_SIZE, batch, num_keys, - num_heads, channels, num_levels, num_query, num_points); - - if (stage_1_max_deal_n <= 0 || stage_2_max_deal_n <= 0) { - return; - } - - int32_t cluster_begin_batch_head = 0; - int32_t cluster_act_batch_head = 0; - int32_t cluster_end_batch_head = 0; - int32_t core_begin_query 
= 0; - int32_t core_act_query = 0; - int32_t core_loop_num = 0; - int32_t core_step_query = 0; - splitTaskV2(cluster_begin_batch_head, cluster_act_batch_head, - cluster_end_batch_head, core_begin_query, core_act_query, - core_loop_num, core_step_query, max_cached_n, batch, num_keys, - num_heads, channels, num_levels, num_query, num_points); - - prepareLoopV2(seq_nram, zeros_nram, spatial_offset_nram, spatial_hw_nram, - mask_x_nram, mask_y_nram, spatial_offset_bd_nram, - spatial_h_bd_nram, spatial_w_bd_nram, value_sram, - data_level_start_index, spatial_shapes, num_keys, num_levels, - num_points, stage_1_max_deal_n, mask_size, channels); - - for (int32_t bh_idx = cluster_begin_batch_head; - bh_idx < cluster_end_batch_head; bh_idx++) { - int32_t b = bh_idx / num_heads; - int32_t head_idx = bh_idx % num_heads; - size_t output_base_offset = - (size_t)b * output_stride_3 + head_idx * channels; - size_t attn_weight_base_offset = - (size_t)b * input_stride_4 + head_idx * input_stride_2; - size_t data_value_base_offset = - (size_t)b * data_value_stride_3 + head_idx * channels; - - for (int32_t i = 0; __is_ipu() && i < core_loop_num; i++) { - int32_t deal_n = - std::min(core_act_query - core_step_query * i, core_step_query); - int32_t core_query_offset = i * core_step_query; - size_t attn_weight_offset = - attn_weight_base_offset + - (core_begin_query + core_query_offset) * input_stride_3; - size_t loc_offset = attn_weight_offset * 2; - size_t output_offset = - output_base_offset + - (core_begin_query + core_query_offset) * output_stride_2; - - // compute offset/cond/wp - stageOneLoop((T*)data_sampling_loc + loc_offset, - (T*)data_attn_weight + attn_weight_offset, data_offset_nram, - delta_xy_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, - buf_nram, buf_nram_end, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_w_bd_nram, spatial_h_bd_nram, - spatial_offset_nram, spatial_hw_nram, data_offset_sram, - delta_xy_sram, 
weight_polation_sram, weight_attn_sram, - cond_point_polation_sram, true, true, deal_n, - stage_1_max_deal_n, num_heads, channels, num_levels, - num_points, input_stride_2, input_stride_3); - - // compute grad_value/grad_attn_w - backwardStageTwoLoop( - seq_nram, compute_buffer_nram_stg2, zeros_nram, - weight_polation_nram_stg2, weight_attn_nram_stg2, offset_nram_stg2, - cond_nram_stg2, bit_cond_nram, bit_cond_reverse_nram, - grad_output_nram, delta_xy_nram, data_offset_sram, - weight_polation_sram, grad_wp_sram, weight_attn_sram, - cond_point_polation_sram, delta_xy_sram, - (T*)data_value + data_value_base_offset, - (T*)grad_output + output_offset, - (T*)grad_value + data_value_base_offset, - (T*)grad_attn_weight + attn_weight_offset, wram_buffer, deal_n, - stage_2_max_deal_n, input_stride_2, input_stride_3, output_stride_2, - num_heads, channels, num_levels, num_points); - - // caompute grad_loc - backwardStageThreeLoop( - compute_buffer_nram_stg3, delta_xy_nram_stg3, grad_wp_nram_stg3, - spatial_h_bd_nram, spatial_w_bd_nram, delta_xy_sram, grad_wp_sram, - (T*)grad_sampling_loc + loc_offset, deal_n, stage_3_max_deal_n, - input_stride_2, input_stride_3, output_stride_2, num_heads, channels, - num_levels, num_points); - } - } -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float* data_value, const int32_t* spatial_shapes, - const int32_t* data_level_start_index, const float* data_sampling_loc, - const float* data_attn_weight, const float* grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float* grad_value, float* grad_sampling_loc, - float* grad_attn_weight) { - KERNEL_CHECK( - MLUUnion1KernelMsDeformAttnBackwardFastKernel<<>>( - data_value, spatial_shapes, data_level_start_index, data_sampling_loc, - data_attn_weight, 
grad_output, batch, spatial_size, num_heads, - channels, num_levels, num_query, num_points, grad_value, - grad_sampling_loc, grad_attn_weight)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu deleted file mode 100644 index 1237cfc53..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ /dev/null @@ -1,1012 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "ms_deform_attn_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define ALIGN_NUM 32 - -void __mlu_func__ computeGridMaskAndOffset( - float *nram_grad_output_tl, float *nram_grad_output_tr, float *nram_loc_w, - float *nram_loc_h, float *nram_h_stride, int32_t *nram_spatial_shapes, - float *nram_w_low_temp, float *nram_h_high_temp, float *nram_w_low, - float *nram_h_low, float *nram_h_high, float *nram_w_high, float *nram_lh, - float *nram_lw, float *nram_hh, float *nram_hw, - float *nram_h_low_ptr_offset, float *nram_h_high_ptr_offset, - float *nram_w_low_ptr_offset, float *nram_w_high_ptr_offset, float *nram_w1, - float *nram_w2, float *nram_w3, float *nram_w4, float *nram_offset_temp, - float *nram_offset1, float *nram_offset2, float *nram_offset3, - float *nram_offset4, float *nram_base_ptr, float *nram_h_low_temp, - const int32_t &num_deal_grid, const int32_t &num_per_time_real, - const int32_t &num_heads, const int32_t &num_levels, - const int32_t &num_points, const int32_t &w_stride, - const int32_t &qid_stride, float *grad_temp1) { - // [num_levels, 2] --> [2, num_levels] -#if __BANG_ARCH__ >= 372 - __bang_transpose(nram_grad_output_tl, nram_loc_w, num_deal_grid, - 2); // 2 * xhlp - __bang_transpose(nram_loc_w, nram_grad_output_tl, - num_per_time_real * num_heads * num_levels, num_points); - __bang_transpose(nram_loc_h, nram_grad_output_tl + num_deal_grid, - num_per_time_real * num_heads * num_levels, num_points); - - __bang_transpose((int32_t *)nram_grad_output_tr, - (int32_t *)nram_spatial_shapes, num_levels, 2); - __bang_mul_scalar((int32_t *)nram_h_stride, - (int32_t *)((int32_t *)nram_grad_output_tr + num_levels), - w_stride, num_levels); - - __memcpy_async((int32_t *)nram_spatial_shapes, (int32_t *)nram_grad_output_tr, - num_levels * 2 * sizeof(int32_t), 
NRAM2NRAM); - __bang_int322float((float *)nram_spatial_shapes, - (int32_t *)nram_spatial_shapes, num_levels * 2, 0); - __bang_cycle_mul((float *)nram_loc_w, (float *)nram_loc_w, - (float *)(nram_spatial_shapes + num_levels), num_deal_grid, - num_levels); - __bang_cycle_mul((float *)nram_loc_h, (float *)nram_loc_h, - (float *)(nram_spatial_shapes), num_deal_grid, num_levels); - __bang_sub_scalar((float *)nram_loc_w, (float *)nram_loc_w, 0.5, - num_deal_grid); - __bang_sub_scalar((float *)nram_loc_h, (float *)nram_loc_h, 0.5, - num_deal_grid); - - // get mask. (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - __bang_cycle_lt((float *)nram_w_low_temp, (float *)nram_loc_w, - (float *)(nram_spatial_shapes + num_levels), num_deal_grid, - num_levels); - __bang_cycle_lt((float *)nram_h_high_temp, (float *)nram_loc_h, - (float *)(nram_spatial_shapes), num_deal_grid, num_levels); - - __bang_and((float *)nram_w_low_temp, (float *)nram_w_low_temp, - (float *)nram_h_high_temp, num_deal_grid); - __bang_gt_scalar((float *)nram_h_high_temp, (float *)nram_loc_h, -1, - num_deal_grid); - __bang_and((float *)nram_h_high_temp, (float *)nram_h_high_temp, - (float *)nram_w_low_temp, num_deal_grid); - __bang_gt_scalar((float *)nram_w_low_temp, (float *)nram_loc_w, -1, - num_deal_grid); - __bang_and((float *)nram_h_high_temp, (float *)nram_h_high_temp, - (float *)nram_w_low_temp, num_deal_grid); - - __bang_transpose((float *)nram_w_low_temp, (float *)nram_h_high_temp, - num_points, num_per_time_real * num_heads * num_levels); - __memcpy_async((float *)nram_h_high_temp, (float *)nram_w_low_temp, - num_deal_grid * sizeof(float), NRAM2NRAM); - - __bang_transpose((float *)nram_grad_output_tl, (float *)nram_loc_w, - num_points, num_per_time_real * num_heads * num_levels); - __memcpy_async((float *)nram_loc_w, (float *)nram_grad_output_tl, - num_deal_grid * sizeof(float), NRAM2NRAM); - __bang_transpose((float *)nram_grad_output_tl, (float *)nram_loc_h, - num_points, 
num_per_time_real * num_heads * num_levels); - __memcpy_async((float *)nram_loc_h, (float *)nram_grad_output_tl, - num_deal_grid * sizeof(float), NRAM2NRAM); - - __bang_floor(nram_w_low, nram_loc_w, num_deal_grid); - __bang_floor(nram_h_low, nram_loc_h, num_deal_grid); - __bang_sub((float *)nram_lh, (float *)nram_loc_h, (float *)nram_h_low, - num_deal_grid); - __bang_sub((float *)nram_lw, (float *)nram_loc_w, (float *)nram_w_low, - num_deal_grid); - __bang_fusion(FUSION_FMA, nram_hh, nram_lh, (float)(-1), 1, num_deal_grid); - __bang_fusion(FUSION_FMA, nram_hw, nram_lw, (float)(-1), 1, num_deal_grid); - __bang_float2int32((int32_t *)nram_w_low, nram_w_low, num_deal_grid, 0); - __bang_float2int32((int32_t *)nram_h_low, nram_h_low, num_deal_grid, 0); - - __bang_add_scalar((int32_t *)nram_h_high, (int32_t *)nram_h_low, 1, - num_deal_grid); - __bang_add_scalar((int32_t *)nram_w_high, (int32_t *)nram_w_low, 1, - num_deal_grid); - - __bang_transpose((int32_t *)nram_h_low_ptr_offset, (int32_t *)nram_h_low, - num_per_time_real * num_heads * num_levels, num_points); - __bang_cycle_mul((int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_h_low_ptr_offset, (int32_t *)nram_h_stride, - num_deal_grid, num_levels); - __bang_cycle_add((int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_h_low_ptr_offset, (int32_t *)nram_h_stride, - num_deal_grid, num_levels); - - __bang_transpose((int32_t *)nram_w_low_ptr_offset, - (int32_t *)nram_h_low_ptr_offset, num_points, - num_per_time_real * num_heads * num_levels); - - __memcpy_async((int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, - num_deal_grid * sizeof(int32_t), NRAM2NRAM); - __bang_transpose((int32_t *)nram_w_low_ptr_offset, - (int32_t *)nram_h_high_ptr_offset, num_points, - num_per_time_real * num_heads * num_levels); - __memcpy_async((int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, - num_deal_grid * sizeof(int32_t), NRAM2NRAM); - __bang_mul_scalar((int32_t *)nram_w_low_ptr_offset, (int32_t 
*)nram_w_low, - qid_stride, num_deal_grid); - __bang_add_scalar((int32_t *)nram_w_high_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, qid_stride, - num_deal_grid); - - __bang_add((int32_t *)nram_offset1, (int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, num_deal_grid); - - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset1, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - - __bang_transpose((int32_t *)nram_offset1, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - - __bang_add((int32_t *)nram_offset2, (int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_w_high_ptr_offset, num_deal_grid); - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset2, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - __bang_transpose((int32_t *)nram_offset2, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - - __bang_add((int32_t *)nram_offset3, (int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, num_deal_grid); - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset3, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - __bang_transpose((int32_t *)nram_offset3, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - __bang_add((int32_t *)nram_offset4, (int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_w_high_ptr_offset, num_deal_grid); - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset4, - num_per_time_real * num_heads, num_levels * num_points); - 
__bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - __bang_transpose((int32_t *)nram_offset4, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - - // h_low >= 0 && w_low >= 0 mask2 - float *mask1 = nram_h_low_ptr_offset; - float *mask2 = nram_h_high_ptr_offset; - float *mask3 = nram_w_low_ptr_offset; - float *mask4 = nram_w_high_ptr_offset; - - __bang_int322float(nram_w_low, (int32_t *)nram_w_low, num_deal_grid, 0); - __bang_int322float(nram_h_low, (int32_t *)nram_h_low, num_deal_grid, 0); - - __bang_ge_scalar(mask1, nram_h_low, 0, num_deal_grid); - __bang_ge_scalar(mask2, nram_w_low, 0, num_deal_grid); - __bang_and(mask2, mask1, mask2, num_deal_grid); - - __bang_and(mask2, nram_h_high_temp, mask2, num_deal_grid); - - // h_low >= 0 && w_high <= width - 1 mask1 - __bang_int322float(nram_w_high, (int32_t *)nram_w_high, num_deal_grid, 0); - - __bang_transpose(mask3, nram_w_high, - num_per_time_real * num_heads * num_levels, num_points); - - __bang_sub_scalar((float *)nram_spatial_shapes, (float *)nram_spatial_shapes, - 1, num_levels * 2); - - __bang_cycle_le((float *)mask3, (float *)mask3, - (float *)(nram_spatial_shapes + num_levels), num_deal_grid, - num_levels); - __bang_transpose(mask4, mask3, num_points, - num_per_time_real * num_heads * num_levels); - __bang_and(mask1, mask1, mask4, num_deal_grid); - __bang_and(mask1, nram_h_high_temp, mask1, num_deal_grid); - - // h_high <= height - 1 && w_high <= width - 1 mask3 - __bang_int322float(nram_h_high, (int32_t *)nram_h_high, num_deal_grid, 0); - __bang_transpose(mask3, nram_h_high, - num_per_time_real * num_heads * num_levels, num_points); - - __bang_cycle_le((float *)mask3, (float *)mask3, - (float *)(nram_spatial_shapes), num_deal_grid, num_levels); - - __bang_transpose(nram_h_low_temp, mask3, num_points, - num_per_time_real * num_heads * num_levels); - __bang_and(mask4, mask4, nram_h_low_temp, 
num_deal_grid); - __bang_and(mask3, mask4, nram_h_high_temp, num_deal_grid); - - // h_high <= height - 1 && w_low >= 0 mask4 - __bang_ge_scalar(nram_w_low_temp, nram_w_low, 0, num_deal_grid); - __bang_and(mask4, nram_h_low_temp, nram_w_low_temp, num_deal_grid); - __bang_and(mask4, mask4, nram_h_high_temp, num_deal_grid); - __bang_int322float(nram_offset1, (int32_t *)nram_offset1, num_deal_grid, 0); - __bang_gt_scalar(grad_temp1, nram_offset1, 0, num_deal_grid); - - __bang_mul(nram_offset1, nram_offset1, grad_temp1, num_deal_grid); - __bang_mul(nram_offset1, nram_offset1, mask2, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset1, nram_offset1, num_deal_grid, 0); - - __bang_int322float((float *)nram_offset2, (int32_t *)nram_offset2, - num_deal_grid, 0); - __bang_gt_scalar((float *)grad_temp1, (float *)nram_offset2, 0, - num_deal_grid); - - __bang_mul(nram_offset2, nram_offset2, grad_temp1, num_deal_grid); - __bang_mul(nram_offset2, nram_offset2, mask1, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset2, nram_offset2, num_deal_grid, 0); - - __bang_int322float((float *)nram_offset3, (int32_t *)nram_offset3, - num_deal_grid, 0); - __bang_gt_scalar((float *)grad_temp1, (float *)nram_offset3, 0, - num_deal_grid); - - __bang_mul(nram_offset3, nram_offset3, grad_temp1, num_deal_grid); - __bang_mul(nram_offset3, nram_offset3, mask4, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset3, nram_offset3, num_deal_grid, 0); - - __bang_int322float((float *)nram_offset4, (int32_t *)nram_offset4, - num_deal_grid, 0); - __bang_gt_scalar((float *)grad_temp1, (float *)nram_offset4, 0, - num_deal_grid); - - __bang_mul(nram_offset4, nram_offset4, grad_temp1, num_deal_grid); - __bang_mul(nram_offset4, nram_offset4, mask3, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset4, nram_offset4, num_deal_grid, 0); - __sync_io_move_compute(); - - __bang_mul(nram_w1, nram_hh, nram_hw, num_deal_grid); - __bang_mul(nram_w2, nram_hh, nram_lw, num_deal_grid); - 
__bang_mul(nram_w3, nram_lh, nram_hw, num_deal_grid); - __bang_mul(nram_w4, nram_lh, nram_lw, num_deal_grid); -#endif -} - -void __mlu_func__ loadValue( - float *nram_grad_output_tl, float *nram_grad_output_tr, - float *nram_grad_output_bl, float *nram_grad_output_br, - const float *data_value, float *grad_temp1, float *grad_temp3, float *mask1, - float *mask2, float *mask3, float *mask4, float *nram_offset1, - float *nram_offset2, float *nram_offset3, float *nram_offset4, - float *nram_grad_weight, int32_t *nram_level_start_index, - const int32_t &offset_nram, const int32_t &num_heads, - const int32_t &deal_num_real, const int32_t &num_deal_grid, - const int32_t &num_query, const int32_t &num_levels, - const int32_t &num_points, const int32_t &grid_offset, - const int32_t &spatial_size, const int32_t &qid_stride) { -#if __BANG_ARCH__ >= 372 - int32_t value_offset_temp = 0; - -#if __BANG_ARCH__ >= 592 - for (int i = 0; i < num_deal_grid; ++i) { - int32_t b_col = - (grid_offset + i) / num_query / num_heads / num_levels / num_points; - int32_t l_col = (grid_offset + i) / num_points % num_levels; - int32_t level_start_id = nram_level_start_index[l_col]; - value_offset_temp = - b_col * spatial_size * qid_stride + level_start_id * qid_stride; - ((int32_t *)grad_temp1)[i] = value_offset_temp; - } - - __bang_add((int32_t *)grad_temp3, (int32_t *)grad_temp1, - (int32_t *)nram_offset1, num_deal_grid); - __bang_add((int32_t *)(grad_temp3 + num_deal_grid), (int32_t *)grad_temp1, - (int32_t *)nram_offset2, num_deal_grid); - __bang_add((int32_t *)(grad_temp3 + 2 * num_deal_grid), (int32_t *)grad_temp1, - (int32_t *)nram_offset3, num_deal_grid); - __bang_add((int32_t *)(grad_temp3 + 3 * num_deal_grid), (int32_t *)grad_temp1, - (int32_t *)nram_offset4, num_deal_grid); - __bang_mul_scalar((int32_t *)grad_temp3, (int32_t *)grad_temp3, - sizeof(int32_t), 4 * num_deal_grid); - __sync_io_move_compute(); - - __gather_async((void *)nram_grad_output_tl, (void *)data_value, - (unsigned 
int *)grad_temp3, deal_num_real * sizeof(float), - GDRAM2NRAM, deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_tr, (void *)data_value, - (unsigned int *)(grad_temp3 + num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_bl, (void *)data_value, - (unsigned int *)(grad_temp3 + 2 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_br, (void *)data_value, - (unsigned int *)(grad_temp3 + 3 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - __sync_io_move_compute(); - -#else - int32_t b_col = - (grid_offset) / num_query / num_heads / num_levels / num_points; - int32_t l_col = (grid_offset) / num_points % num_levels; - int32_t level_start_id = nram_level_start_index[l_col]; - value_offset_temp = - b_col * spatial_size * qid_stride + level_start_id * qid_stride; - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __memcpy_async((void *)(nram_grad_output_tl + loop * deal_num_real), - (void *)(data_value + value_offset_temp + - (((int32_t *)nram_offset1)[loop])), - deal_num_real * sizeof(float), GDRAM2NRAM, - offset_nram * sizeof(float), - ((((int32_t *)nram_offset2)[loop]) - - (((int32_t *)nram_offset1)[loop])) * - sizeof(float), - mask1[loop]); - b_col = (grid_offset + loop + 1) / num_query / num_heads / num_levels / - num_points; - l_col = (grid_offset + loop + 1) / num_points % num_levels; - level_start_id = nram_level_start_index[l_col]; - - __memcpy_async((void *)(nram_grad_output_bl + loop * deal_num_real), - (void *)(data_value + value_offset_temp + - (((int32_t *)nram_offset3)[loop])), - deal_num_real * sizeof(float), GDRAM2NRAM, - offset_nram * sizeof(float), - ((((int32_t *)nram_offset4)[loop]) - - (((int32_t *)nram_offset3)[loop])) * - sizeof(float), 
- mask3[loop]); - value_offset_temp = - b_col * spatial_size * qid_stride + level_start_id * qid_stride; - } -#endif - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask2, deal_num_real * num_deal_grid, - num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask1, deal_num_real * num_deal_grid, - num_deal_grid); - __sync_io_move_compute(); - - __bang_band((char *)nram_grad_output_tl, (char *)nram_grad_output_tl, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_tr, (char *)nram_grad_output_tr, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); - - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask4, deal_num_real * num_deal_grid, - num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_bl, (char *)nram_grad_output_bl, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); - - 
__bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask3, deal_num_real * num_deal_grid, - num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_br, (char *)nram_grad_output_br, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); -#endif -} - -void __mlu_func__ computeGradValue( - float *grad_temp1, float *grad_temp2, float *grad_temp3, float *grad_temp4, - float *mask1, float *mask2, float *mask3, float *mask4, float *nram_offset1, - float *nram_offset2, float *nram_offset3, float *nram_offset4, - int32_t *nram_level_start_index, int32_t deal_num_real, - const float *grad_value, float *nram_w1, float *nram_w2, float *nram_w3, - float *nram_w4, const int32_t &num_per_time_real, const int32_t &num_heads, - const int32_t &num_levels, const int32_t &num_points, - const int32_t &num_query, const int32_t &num_deal_grid, - const int32_t &grid_offset, const int32_t &spatial_size, - const int32_t &qid_stride, float *nram_grid_offset1, - float *nram_grid_offset2, const int32_t &batch, float *nram_grad_output_tl, - float *nram_grad_output_tr, float *nram_grad_output_bl, - float *nram_grad_output_br, float *nram_grad_weight) { -#if __BANG_ARCH__ >= 372 - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, - deal_num_real * num_deal_grid, num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, - deal_num_real * num_per_time_real * num_heads, - num_levels * num_points); - __bang_transpose(grad_temp1, grad_temp2, num_per_time_real * num_heads, - deal_num_real); - __bang_cycle_mul(grad_temp3, grad_temp3, grad_temp1, - num_deal_grid * deal_num_real, - 
deal_num_real * num_per_time_real * num_heads); - __bang_transpose(grad_temp4, grad_temp3, num_levels * num_points, - deal_num_real * num_per_time_real * num_heads); - - int32_t temp_res = num_query * num_heads * num_levels * num_points; - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - ((int32_t *)nram_grid_offset1)[loop] = ((loop + grid_offset) / temp_res); - } - __bang_mul_scalar((int32_t *)nram_grid_offset1, (int32_t *)nram_grid_offset1, - spatial_size * qid_stride, num_deal_grid); - __bang_transpose((int32_t *)nram_grid_offset2, (int32_t *)nram_grid_offset1, - num_per_time_real * num_heads * num_levels, num_points); - - __bang_mul_scalar((int32_t *)nram_grid_offset1, - (int32_t *)nram_level_start_index, qid_stride, num_levels); - __bang_cycle_add((int32_t *)nram_grid_offset2, (int32_t *)nram_grid_offset2, - (int32_t *)nram_grid_offset1, num_deal_grid, num_levels); - __bang_transpose((int32_t *)nram_grid_offset1, (int32_t *)nram_grid_offset2, - num_points, num_per_time_real * num_heads * num_levels); - - __bang_add((int32_t *)nram_offset1, (int32_t *)nram_offset1, - (int32_t *)nram_grid_offset1, num_deal_grid); - __bang_add((int32_t *)nram_offset2, (int32_t *)nram_offset2, - (int32_t *)nram_grid_offset1, num_deal_grid); - __bang_add((int32_t *)nram_offset3, (int32_t *)nram_offset3, - (int32_t *)nram_grid_offset1, num_deal_grid); - __bang_add((int32_t *)nram_offset4, (int32_t *)nram_offset4, - (int32_t *)nram_grid_offset1, num_deal_grid); - -#if __BANG_ARCH__ >= 592 - // make sure offset not great than (batch * spatial_size * num_heads * - // channels) - __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset1, - batch * spatial_size * num_heads * deal_num_real, - num_deal_grid); - __bang_mul((int32_t *)nram_offset1, (int32_t *)nram_offset1, - (int32_t *)grad_temp1, num_deal_grid); - __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset2, - batch * spatial_size * num_heads * deal_num_real, - num_deal_grid); - __bang_mul((int32_t 
*)nram_offset2, (int32_t *)nram_offset2, - (int32_t *)grad_temp1, num_deal_grid); - __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset3, - batch * spatial_size * num_heads * deal_num_real, - num_deal_grid); - __bang_mul((int32_t *)nram_offset3, (int32_t *)nram_offset3, - (int32_t *)grad_temp1, num_deal_grid); - __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset4, - batch * spatial_size * num_heads * deal_num_real, - num_deal_grid); - __bang_mul((int32_t *)nram_offset4, (int32_t *)nram_offset4, - (int32_t *)grad_temp1, num_deal_grid); - __bang_mul(grad_temp3, nram_w1, mask2, num_deal_grid); - __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset1)[loop]), - (float *)(grad_temp3 + loop * deal_num_real), deal_num_real); - } - - __bang_mul(grad_temp3, nram_w2, mask1, num_deal_grid); - __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __bang_atomic_reduce_add( - (float *)(grad_value + +((int32_t *)nram_offset2)[loop]), - (float *)(grad_temp3 + loop * deal_num_real), deal_num_real); - } - - __bang_mul(grad_temp3, nram_w3, mask4, num_deal_grid); - __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3, - num_deal_grid * deal_num_real, num_deal_grid); - - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset3)[loop]), - (float *)(grad_temp3 + loop * deal_num_real), deal_num_real); - } - __bang_mul(grad_temp3, nram_w4, mask3, num_deal_grid); - 
__bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset4)[loop]), - (float *)(grad_temp3 + loop * deal_num_real), deal_num_real); - } -#else - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w1, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_br, grad_temp1, deal_num_real, - num_deal_grid); - - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w2, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_tl, grad_temp1, deal_num_real, - num_deal_grid); - - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w3, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_tr, grad_temp1, deal_num_real, - num_deal_grid); - - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w4, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_bl, grad_temp1, deal_num_real, - num_deal_grid); - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - if (mask2[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + int32_t(((int32_t *)nram_offset1)[loop])), - (float *)(nram_grad_output_br + loop * deal_num_real), deal_num_real); - } - if (mask1[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset2)[loop]), - (float *)(nram_grad_output_tl + loop * deal_num_real), deal_num_real); - } - if (mask4[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset3)[loop]), - (float *)(nram_grad_output_tr + loop * deal_num_real), deal_num_real); - } - - if (mask3[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset4)[loop]), - (float *)(nram_grad_output_bl + loop * deal_num_real), deal_num_real); - } - } -#endif -#endif -} - -void __mlu_func__ 
computeGradAttnWeight( - float *grad_w_weight, float *grad_weight, float *nram_grad_output_tl, - float *nram_grad_output_tr, float *nram_grad_output_bl, - float *nram_grad_output_br, float *grad_temp2, - const float *grad_attn_weight, float *nram_hw, float *nram_hh, - float *nram_lw, float *nram_lh, float *grad_h_weight, float *nram_w1, - float *nram_w2, float *nram_w3, float *nram_w4, const int32_t &offset_nram, - const int32_t &num_deal_grid, const int32_t &deal_num_real, - const int32_t &num_per_time_real, const int32_t &num_heads, - const int32_t &num_levels, const int32_t &num_points, - const int32_t &grid_offset, float *nram_h_high_temp) { - __bang_write_zero(grad_w_weight, 2 * offset_nram); - // grad_output_nram_tl -#if __BANG_ARCH__ >= 372 - __bang_transpose(grad_weight, nram_grad_output_tl, num_deal_grid, - deal_num_real); - __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_hw, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_sub(grad_h_weight, grad_h_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_hh, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_sub(grad_w_weight, grad_w_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_w1, - num_deal_grid * deal_num_real, num_deal_grid); - // nram_grad_output_tr - __bang_transpose(grad_weight, nram_grad_output_tr, num_deal_grid, - deal_num_real); - __bang_cycle_mul(nram_grad_output_tr, grad_weight, nram_lw, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_sub(grad_h_weight, grad_h_weight, nram_grad_output_tr, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_tr, grad_weight, nram_hh, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_add(grad_w_weight, grad_w_weight, nram_grad_output_tr, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_tr, grad_weight, nram_w2, - num_deal_grid * deal_num_real, 
num_deal_grid); - __bang_add(nram_grad_output_tl, nram_grad_output_tl, nram_grad_output_tr, - num_deal_grid * deal_num_real); - - // nram_grad_output_tl - __bang_transpose(grad_weight, nram_grad_output_bl, num_deal_grid, - deal_num_real); - __bang_cycle_mul(nram_grad_output_bl, grad_weight, nram_hw, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_add(grad_h_weight, grad_h_weight, nram_grad_output_bl, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_bl, grad_weight, nram_lh, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_sub(grad_w_weight, grad_w_weight, nram_grad_output_bl, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_bl, grad_weight, nram_w3, - num_deal_grid * deal_num_real, num_deal_grid); - - __bang_add(nram_grad_output_tl, nram_grad_output_tl, nram_grad_output_bl, - num_deal_grid * deal_num_real); - - // nram_grad_output_br - __bang_transpose(grad_weight, nram_grad_output_br, num_deal_grid, - deal_num_real); - __bang_cycle_mul(nram_grad_output_br, grad_weight, nram_lw, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_add(grad_h_weight, grad_h_weight, nram_grad_output_br, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_br, grad_weight, nram_lh, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_add(grad_w_weight, grad_w_weight, nram_grad_output_br, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_br, grad_weight, nram_w4, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_add(nram_grad_output_tl, nram_grad_output_tl, nram_grad_output_br, - num_deal_grid * deal_num_real); - - __bang_transpose(nram_grad_output_br, nram_grad_output_tl, deal_num_real, - num_deal_grid); - __bang_transpose(nram_grad_output_tr, nram_grad_output_br, - num_per_time_real * num_heads, - num_points * num_levels * deal_num_real); - __bang_transpose(grad_weight, grad_temp2, num_per_time_real * num_heads, - deal_num_real); - __bang_cycle_mul(nram_grad_output_tr, 
nram_grad_output_tr, grad_weight, - num_deal_grid * deal_num_real, - num_per_time_real * num_heads * deal_num_real); - __bang_transpose(nram_grad_output_br, nram_grad_output_tr, - num_points * num_levels * deal_num_real, - num_per_time_real * num_heads); - __bang_transpose((float *)nram_grad_output_tr, (float *)nram_grad_output_br, - num_deal_grid, deal_num_real); - - __mluop_recursive_sum_pool(nram_grad_output_tr, num_deal_grid, deal_num_real, - ALIGN_NUM); - - __bang_float2int32((int32_t *)nram_h_high_temp, nram_h_high_temp, - num_deal_grid, 0); - __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)nram_grad_output_tr, (char *)nram_grad_output_tr, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); - __bang_atomic_reduce_add((float *)grad_attn_weight + grid_offset, - (float *)nram_grad_output_tr, num_deal_grid); -#endif -} - -void __mlu_func__ computeGradSampingLoc( - const float *grad_sampling_loc, float *nram_grad_output_tl, - float *nram_grad_output_tr, float *grad_h_weight, float *grad_w_weight, - int32_t *nram_spatial_shapes, float *grad_temp1, float *grad_temp2, - float *nram_grad_weight, const int32_t &num_deal_grid, - const int32_t &deal_num_real, const int32_t &num_per_time_real, - const int32_t &num_heads, const int32_t &num_levels, - const int32_t &num_points, const int32_t &grid_offset, - float *nram_h_high_temp) { -#if __BANG_ARCH__ >= 372 - __bang_add_scalar((float *)nram_spatial_shapes, (float *)nram_spatial_shapes, - 1.0, 2 * num_levels); - __bang_transpose(nram_grad_output_tl, grad_h_weight, - num_per_time_real * num_heads * num_levels * deal_num_real, - num_points); // pcxhl - __bang_cycle_mul(nram_grad_output_tl, nram_grad_output_tl, - (float *)nram_spatial_shapes, num_deal_grid * deal_num_real, - num_levels); - __bang_transpose(grad_h_weight, nram_grad_output_tl, - num_points * deal_num_real, - 
num_per_time_real * num_heads * num_levels); - - __bang_write_zero(grad_temp1, num_deal_grid * deal_num_real); - __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_tr, grad_temp1, - deal_num_real * num_per_time_real * num_heads, - num_levels * num_points); - __bang_transpose(grad_temp1, grad_temp2, num_per_time_real * num_heads, - deal_num_real); - __bang_cycle_mul(nram_grad_output_tr, nram_grad_output_tr, grad_temp1, - num_deal_grid * deal_num_real, - deal_num_real * num_per_time_real * num_heads); - __bang_transpose(grad_temp1, nram_grad_output_tr, - num_levels * num_points * deal_num_real, - num_per_time_real * num_heads); - - __bang_mul(grad_h_weight, grad_h_weight, grad_temp1, - num_deal_grid * deal_num_real); - __bang_transpose(nram_grad_output_tl, grad_h_weight, num_deal_grid, - deal_num_real); - __memcpy_async(grad_h_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real * sizeof(float), NRAM2NRAM); - __mluop_recursive_sum_pool(grad_h_weight, num_deal_grid, deal_num_real, - ALIGN_NUM); - - __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)grad_h_weight, (char *)grad_h_weight, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); - - __bang_transpose(nram_grad_output_tl, grad_w_weight, - num_per_time_real * num_heads * num_levels * deal_num_real, - num_points); // pcxhl - __bang_cycle_mul(nram_grad_output_tl, nram_grad_output_tl, - (float *)(nram_spatial_shapes + num_levels), - num_deal_grid * deal_num_real, num_levels); - __bang_transpose(grad_w_weight, nram_grad_output_tl, - num_points * deal_num_real, - num_per_time_real * num_heads * num_levels); - - __bang_mul(grad_w_weight, grad_w_weight, grad_temp1, - num_deal_grid * deal_num_real); - __bang_transpose(nram_grad_output_tl, grad_w_weight, num_deal_grid, - 
deal_num_real); - __memcpy_async(grad_w_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real * sizeof(float), NRAM2NRAM); - __mluop_recursive_sum_pool(grad_w_weight, num_deal_grid, deal_num_real, - ALIGN_NUM); - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)grad_w_weight, (char *)grad_w_weight, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); - - __memcpy_async(grad_w_weight + num_deal_grid, grad_h_weight, - num_deal_grid * sizeof(float), NRAM2NRAM); - __bang_transpose(nram_grad_output_tl, grad_w_weight, 2, num_deal_grid); - __bang_atomic_reduce_add((float *)grad_sampling_loc + grid_offset * 2, - (float *)nram_grad_output_tl, 2 * num_deal_grid); -#endif -} - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardSmallChannelsKernel( - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { - const int32_t split_grid_num = 28; - const int32_t split_num_c = 8; - const int32_t C_align = PAD_UP(channels, ALIGN_NUM); - - const int32_t num_hlp = num_heads * num_levels * num_points; - int32_t num_per_time_theory = - (MAX_NRAM_SIZE - num_levels * sizeof(float) - - 3 * PAD_UP(num_levels, 32) * sizeof(int32_t)) / - sizeof(float) / (split_num_c * C_align + split_grid_num) / (num_hlp); - - int32_t deal_grid_num_theory = num_per_time_theory * num_hlp; - - const int32_t offset_nram = num_per_time_theory * C_align * num_hlp; - const int32_t offset_nram_calc = PAD_UP(deal_grid_num_theory, ALIGN_NUM); - float *nram_grad_output_tl = (float *)nram_buffer; - float *nram_grad_output_tr = (float 
*)nram_buffer + offset_nram; - float *nram_grad_output_bl = (float *)nram_buffer + 2 * offset_nram; - float *nram_grad_output_br = (float *)nram_buffer + 3 * offset_nram; - - float *grad_temp1 = (float *)nram_buffer + 4 * offset_nram; - float *grad_temp2 = (float *)nram_buffer + 5 * offset_nram; - float *grad_temp3 = (float *)nram_buffer + 6 * offset_nram; - float *grad_temp4 = (float *)nram_buffer + 7 * offset_nram; - - float *nram_loc_w = (float *)nram_buffer + split_num_c * offset_nram; - float *nram_loc_h = - (float *)nram_buffer + split_num_c * offset_nram + offset_nram_calc; - float *nram_h_low = - (float *)nram_buffer + split_num_c * offset_nram + 2 * offset_nram_calc; - float *nram_w_low = - (float *)nram_buffer + split_num_c * offset_nram + 3 * offset_nram_calc; - float *nram_h_high = - (float *)nram_buffer + split_num_c * offset_nram + 4 * offset_nram_calc; - float *nram_w_high = - (float *)nram_buffer + split_num_c * offset_nram + 5 * offset_nram_calc; - float *nram_h_low_temp = - (float *)nram_buffer + split_num_c * offset_nram + 6 * offset_nram_calc; - float *nram_h_high_temp = - (float *)nram_buffer + split_num_c * offset_nram + 7 * offset_nram_calc; - - float *nram_hw = - (float *)nram_buffer + split_num_c * offset_nram + 8 * offset_nram_calc; - float *nram_hh = - (float *)nram_buffer + split_num_c * offset_nram + 9 * offset_nram_calc; - float *nram_lw = - (float *)nram_buffer + split_num_c * offset_nram + 10 * offset_nram_calc; - float *nram_lh = - (float *)nram_buffer + split_num_c * offset_nram + 11 * offset_nram_calc; - - float *nram_h_low_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 12 * offset_nram_calc; - float *nram_h_high_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 13 * offset_nram_calc; - float *nram_w_low_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 14 * offset_nram_calc; - float *nram_w_high_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 15 * 
offset_nram_calc; - - float *nram_w1 = - (float *)nram_buffer + split_num_c * offset_nram + 16 * offset_nram_calc; - float *nram_w2 = - (float *)nram_buffer + split_num_c * offset_nram + 17 * offset_nram_calc; - float *nram_w3 = - (float *)nram_buffer + split_num_c * offset_nram + 18 * offset_nram_calc; - float *nram_w4 = - (float *)nram_buffer + split_num_c * offset_nram + 19 * offset_nram_calc; - - float *nram_grad_weight = - (float *)nram_buffer + split_num_c * offset_nram + 20 * offset_nram_calc; - float *nram_base_ptr = - (float *)nram_buffer + split_num_c * offset_nram + 21 * offset_nram_calc; - float *nram_offset_temp = - (float *)nram_buffer + split_num_c * offset_nram + 22 * offset_nram_calc; - - float *nram_offset1 = - (float *)nram_buffer + split_num_c * offset_nram + 23 * offset_nram_calc; - float *nram_offset2 = - (float *)nram_buffer + split_num_c * offset_nram + 24 * offset_nram_calc; - float *nram_offset3 = - (float *)nram_buffer + split_num_c * offset_nram + 25 * offset_nram_calc; - float *nram_offset4 = - (float *)nram_buffer + split_num_c * offset_nram + 26 * offset_nram_calc; - - float *nram_w_low_temp = - (float *)nram_buffer + split_num_c * offset_nram + 27 * offset_nram_calc; - int32_t *nram_spatial_shapes = - (int32_t *)((float *)nram_buffer + split_num_c * offset_nram + - 28 * offset_nram_calc); - int32_t *nram_level_start_index = - (int32_t *)(nram_spatial_shapes + 2 * PAD_UP(num_levels, 32)); - float *nram_h_stride = - (float *)(nram_level_start_index + 3 * PAD_UP(num_levels, 32)); - - const int32_t total_num = batch * num_query; - int32_t num_per_core = total_num / taskDim; - int32_t num_rem = total_num % taskDim; - num_per_core = num_per_core + int32_t(taskId < num_rem); - num_per_time_theory = - num_per_core > num_per_time_theory ? num_per_time_theory : num_per_core; - int32_t num_deal_grid = num_per_time_theory * num_hlp; - - if (num_per_core == 0) return; - int32_t start_per_core = num_rem > taskId ? 
(taskId * num_per_core) - : (num_rem + taskId * num_per_core); - - const int32_t qid_stride = num_heads * channels; - int32_t deal_num_real = channels; - - const int32_t repeat_times = num_per_core / num_per_time_theory; - const int32_t tail_num = num_per_core % num_per_time_theory; - - int32_t num_per_time_real = num_per_time_theory; - - for (int32_t loop = 0; loop < num_heads; ++loop) { - ((int32_t *)nram_base_ptr)[loop] = loop * channels; - } - const int32_t w_stride = num_heads * channels; - for (int32_t grid_loop = 0; grid_loop < repeat_times + 1; ++grid_loop) { - int32_t grid_offset = - (start_per_core + grid_loop * num_per_time_theory) * num_hlp; - if (grid_loop == repeat_times) { - if (tail_num == 0) { - continue; - } else { - grid_offset = - (start_per_core + repeat_times * num_per_time_theory) * num_hlp; - num_per_time_real = tail_num; - num_deal_grid = tail_num * num_hlp; - } - } - __memcpy_async(nram_spatial_shapes, spatial_shapes, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - - __memcpy_async(nram_loc_w, data_sampling_loc + grid_offset * 2, - num_deal_grid * 2 * sizeof(float), GDRAM2NRAM); - - __sync_io_move_compute(); - __memcpy_async(nram_grad_weight, data_attn_weight + grid_offset, - num_deal_grid * sizeof(float), GDRAM2NRAM); - __memcpy_async(nram_level_start_index, data_level_start_index, - num_levels * sizeof(int32_t), GDRAM2NRAM); - computeGridMaskAndOffset( - nram_grad_output_tl, nram_grad_output_tr, nram_loc_w, nram_loc_h, - nram_h_stride, nram_spatial_shapes, nram_w_low_temp, nram_h_high_temp, - nram_w_low, nram_h_low, nram_h_high, nram_w_high, nram_lh, nram_lw, - nram_hh, nram_hw, nram_h_low_ptr_offset, nram_h_high_ptr_offset, - nram_w_low_ptr_offset, nram_w_high_ptr_offset, nram_w1, nram_w2, - nram_w3, nram_w4, nram_offset_temp, nram_offset1, nram_offset2, - nram_offset3, nram_offset4, nram_base_ptr, nram_h_low_temp, - num_deal_grid, num_per_time_real, num_heads, num_levels, num_points, - w_stride, qid_stride, grad_temp1); - float 
*mask1 = nram_h_low_ptr_offset; - float *mask2 = nram_h_high_ptr_offset; - float *mask3 = nram_w_low_ptr_offset; - float *mask4 = nram_w_high_ptr_offset; - __memcpy_async( - grad_temp2, - grad_output + (start_per_core + grid_loop * num_per_time_theory) * - num_heads * deal_num_real, - num_per_time_real * num_heads * deal_num_real * sizeof(float), - GDRAM2NRAM); - loadValue(nram_grad_output_tl, nram_grad_output_tr, nram_grad_output_bl, - nram_grad_output_br, data_value, grad_temp1, grad_temp3, mask1, - mask2, mask3, mask4, nram_offset1, nram_offset2, nram_offset3, - nram_offset4, nram_grad_weight, nram_level_start_index, - offset_nram, num_heads, deal_num_real, num_deal_grid, num_query, - num_levels, num_points, grid_offset, spatial_size, qid_stride); - - // compute grad_weight - float *grad_weight = grad_temp1; - float *grad_h_weight = grad_temp4; - float *grad_w_weight = grad_temp3; - computeGradAttnWeight( - grad_w_weight, grad_weight, nram_grad_output_tl, nram_grad_output_tr, - nram_grad_output_bl, nram_grad_output_br, grad_temp2, grad_attn_weight, - nram_hw, nram_hh, nram_lw, nram_lh, grad_h_weight, nram_w1, nram_w2, - nram_w3, nram_w4, offset_nram, num_deal_grid, deal_num_real, - num_per_time_real, num_heads, num_levels, num_points, grid_offset, - nram_h_high_temp); - - // compute grad_sampling_loc - computeGradSampingLoc(grad_sampling_loc, nram_grad_output_tl, - nram_grad_output_tr, grad_h_weight, grad_w_weight, - nram_spatial_shapes, grad_temp1, grad_temp2, - nram_grad_weight, num_deal_grid, deal_num_real, - num_per_time_real, num_heads, num_levels, num_points, - grid_offset, nram_h_high_temp); - - float *nram_grid_offset1 = nram_loc_h; - float *nram_grid_offset2 = nram_loc_w; - computeGradValue( - grad_temp1, grad_temp2, grad_temp3, grad_temp4, mask1, mask2, mask3, - mask4, nram_offset1, nram_offset2, nram_offset3, nram_offset4, - nram_level_start_index, deal_num_real, grad_value, nram_w1, nram_w2, - nram_w3, nram_w4, num_per_time_real, num_heads, 
num_levels, num_points, - num_query, num_deal_grid, grid_offset, spatial_size, qid_stride, - nram_grid_offset1, nram_grid_offset2, batch, nram_grad_output_tl, - nram_grad_output_tr, nram_grad_output_bl, nram_grad_output_br, - nram_grad_weight); - } -} - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardSmallChannels( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { - KERNEL_CHECK( - MLUUnion1KernelMsDeformAttnBackwardSmallChannelsKernel<<>>( - data_value, spatial_shapes, data_level_start_index, data_sampling_loc, - data_attn_weight, grad_output, batch, spatial_size, num_heads, - channels, num_levels, num_query, num_points, grad_value, - grad_sampling_loc, grad_attn_weight)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu b/kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu deleted file mode 100644 index d70448a51..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu +++ /dev/null @@ -1,296 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "ms_deform_attn_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define likely(x) __builtin_expect((x), 1) -#define ALIGN_NUM 64 -#define ALIGN_NUM_FOR_REDUCE 32 -#define LEN_FLOAT sizeof(float) - -template -void __mlu_func__ msDeformAttnCol2imBilinear( - T *top_grad_temp, const int32_t &height, const int32_t &width, const T &w1, - const T &w2, const T &w3, const T &w4, const int32_t &h_low, - const int32_t &w_low, const int32_t &h_high, const int32_t &w_high, - const int32_t &base_ptr, const int32_t &h_low_ptr_offset, - const int32_t &w_low_ptr_offset, const int32_t &h_high_ptr_offset, - const int32_t &w_high_ptr_offset, const T &hh, const T &hw, const T &lh, - const T &lw, T *top_grad, const T &data_attn_weight, T *grad_h_weight, - T *grad_w_weight, T *grad_value, T *grad_output_nram, T *grad_weight, - T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp, - const int32_t &deal_num, const int32_t &deal_num_real, - const T *data_value_ptr) { -#if __BANG_ARCH__ >= 372 - if (h_low >= 0 && w_low >= 0) { - int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - __memcpy(grad_output_nram, data_value_ptr + offset1, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram, hw, deal_num_real); - __bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num_real); - __bang_mul_scalar(grad_weight, grad_output_nram, hh, deal_num_real); - __bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num_real); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w1, deal_num_real); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram, grad_output_nram, w1, deal_num_real); - __bang_atomic_reduce_add((T *)(grad_value + offset1), (T 
*)top_grad_temp, - deal_num_real); - } - if (h_low >= 0 && w_high <= width - 1) { - int32_t offset2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset2, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num_real); - __bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num_real); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, hh, deal_num_real); - __bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num_real); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w2, deal_num_real); - - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w2, - deal_num_real); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num_real); - __bang_atomic_reduce_add((T *)(grad_value + offset2), (T *)top_grad_temp, - deal_num_real); - } - if (h_high <= height - 1 && w_low >= 0) { - int32_t offset3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset3, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, hw, deal_num_real); - __bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num_real); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num_real); - __bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num_real); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w3, deal_num_real); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w3, - deal_num_real); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num_real); - __bang_atomic_reduce_add((T *)(grad_value + offset3), (T *)top_grad_temp, - deal_num_real); - } - if (h_high <= height - 1 && w_high <= width - 1) { 
- int32_t offset4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset4, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num_real); - __bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num_real); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num_real); - __bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num_real); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w4, deal_num_real); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w4, - deal_num_real); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num_real); - - __bang_atomic_reduce_add((T *)(grad_value + offset4), (T *)top_grad_temp, - deal_num_real); - } - __bang_mul(grad_output_nram, grad_output_nram, top_grad, deal_num_real); - __mluop_recursive_sum_pool(grad_output_nram, 1, deal_num_real, - ALIGN_NUM_FOR_REDUCE); - __bang_atomic_reduce_add((T *)grad_attn_weight, (T *)grad_output_nram, 1); - __bang_mul_scalar(grad_w_weight, grad_w_weight, width, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul(grad_w_weight, grad_w_weight, top_grad_temp, deal_num_real); - __mluop_recursive_sum_pool(grad_w_weight, 1, deal_num_real, - ALIGN_NUM_FOR_REDUCE); - __bang_atomic_reduce_add((T *)(grad_sampling_loc), (T *)grad_w_weight, 1); - - __bang_mul_scalar(grad_h_weight, grad_h_weight, height, deal_num_real); - __bang_mul(grad_h_weight, grad_h_weight, top_grad_temp, deal_num_real); - __mluop_recursive_sum_pool(grad_h_weight, 1, deal_num_real, - ALIGN_NUM_FOR_REDUCE); - __bang_atomic_reduce_add((T *)(grad_sampling_loc + 1), (T *)grad_h_weight, 1); -#endif -} - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( - const float *data_value, const int32_t 
*spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { -#if __BANG_ARCH__ != 520 - if (__is_mpu()) { - return; - } - const int32_t split_num = 8; - const int32_t spatial_shapes_size = 64; - - const int32_t deal_num = PAD_DOWN( - (MAX_NRAM_SIZE - spatial_shapes_size) / split_num / LEN_FLOAT, ALIGN_NUM); - float *grad_output_nram = (float *)nram_buffer; - float *grad_output_nram_temp = (float *)nram_buffer + deal_num; - float *grad_weight = (float *)nram_buffer + 2 * deal_num; - float *grad_h_weight = (float *)nram_buffer + 3 * deal_num; - float *grad_w_weight = (float *)nram_buffer + 4 * deal_num; - float *top_grad = (float *)nram_buffer + 5 * deal_num; - float *top_grad_temp = (float *)nram_buffer + 6 * deal_num; - int32_t *spatial_shapes_nram = - (int32_t *)((float *)nram_buffer + 7 * deal_num); - float *sampling_loc_nram = - (float *)nram_buffer + 7 * deal_num + 2 * sizeof(int32_t); - const int32_t total_num = batch * num_query * num_heads * num_levels; - int32_t num_per_core = total_num / taskDim; - int32_t num_rem = total_num % taskDim; - num_per_core = num_per_core + int32_t(taskId < num_rem); - int32_t start_per_core = - num_rem > taskId - ? 
(taskId * num_per_core) - : ((num_per_core + 1) * num_rem + (taskId - num_rem) * num_per_core); - int32_t end_per_core = start_per_core + num_per_core; - const int32_t C_repeat = channels / deal_num; - const int32_t C_tail = channels % deal_num; - const int32_t qid_stride = num_heads * channels; - for (int32_t num_loop = start_per_core; num_loop < end_per_core; ++num_loop) { - const int32_t l_col = num_loop % num_levels; - const int32_t m_col = num_loop / num_levels % num_heads; - const int32_t q_col = num_loop / num_levels / num_heads % num_query; - const int32_t b_col = num_loop / num_query / num_heads / num_levels; - int32_t data_weight_ptr = num_loop * num_points; - int32_t data_loc_w_ptr = data_weight_ptr << 1; - const int32_t value_offset = b_col * spatial_size * qid_stride; - const int32_t level_start_id = data_level_start_index[l_col]; - const int32_t grad_attn_weight_out = num_loop * num_points; - int32_t spatial_h_ptr = l_col << 1; - int32_t grad_output_offset = - b_col * num_query * qid_stride + q_col * qid_stride + m_col * channels; - __memcpy(spatial_shapes_nram, spatial_shapes + spatial_h_ptr, - 2 * sizeof(int32_t), GDRAM2NRAM); - const int32_t spatial_h = spatial_shapes_nram[0]; - const int32_t spatial_w = spatial_shapes_nram[1]; - const int32_t h_stride = spatial_w * qid_stride; - const int32_t value_ptr_offset = value_offset + level_start_id * qid_stride; - const float *data_value_ptr = data_value + value_ptr_offset; - float *grad_value_ptr = grad_value + value_ptr_offset; - - const int32_t grad_sampling_loc_out = num_loop * num_points << 1; - for (int32_t p_col = 0; p_col < num_points; ++p_col) { - __memcpy(sampling_loc_nram, data_sampling_loc + data_loc_w_ptr, - (LEN_FLOAT << 1), GDRAM2NRAM); - const float loc_w = sampling_loc_nram[0]; - const float loc_h = sampling_loc_nram[1]; - const float weight = data_attn_weight[data_weight_ptr]; - const float h_im = loc_h * spatial_h - 0.5; - const float w_im = loc_w * spatial_w - 0.5; - if (h_im > -1 && 
w_im > -1 && h_im < spatial_h && w_im < spatial_w) { - const int32_t h_low = floorf(h_im); - const int32_t w_low = floorf(w_im); - const int32_t h_high = h_low + 1; - const int32_t w_high = w_low + 1; - - const float lh = h_im - h_low; - const float lw = w_im - w_low; - const float hh = 1.0 - lh; - const float hw = 1.0 - lw; - - const int32_t h_low_ptr_offset = h_low * h_stride; - const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int32_t w_low_ptr_offset = w_low * qid_stride; - const int32_t w_high_ptr_offset = w_low_ptr_offset + qid_stride; - - const float w1 = hh * hw; - const float w2 = hh * lw; - const float w3 = lh * hw; - const float w4 = lh * lw; - if (likely(C_tail != 0)) { - const int32_t base_ptr = m_col * channels + C_repeat * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); - - __memcpy(top_grad, - grad_output + grad_output_offset + C_repeat * deal_num, - C_tail * LEN_FLOAT, GDRAM2NRAM); - msDeformAttnCol2imBilinear( - top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low, - h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset, - h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad, - weight, grad_h_weight, grad_w_weight, grad_value_ptr, - grad_output_nram, grad_weight, - grad_sampling_loc + grad_sampling_loc_out + (p_col << 1), - grad_attn_weight + grad_attn_weight_out + p_col, - grad_output_nram_temp, deal_num, C_tail, data_value_ptr); - } - for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) { - const int32_t base_ptr = m_col * channels + C_loop * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); - __memcpy(top_grad, - grad_output + grad_output_offset + C_loop * deal_num, - deal_num * 
LEN_FLOAT, GDRAM2NRAM); - msDeformAttnCol2imBilinear( - top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low, - h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset, - h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad, - weight, grad_h_weight, grad_w_weight, grad_value_ptr, - grad_output_nram, grad_weight, - grad_sampling_loc + grad_sampling_loc_out + (p_col << 1), - grad_attn_weight + grad_attn_weight_out + p_col, - grad_output_nram_temp, deal_num, deal_num, data_value_ptr); - } - } - data_weight_ptr += 1; - data_loc_w_ptr += 2; - } - } - -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardDefault( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { - KERNEL_CHECK( - MLUUnion1KernelMsDeformAttnBackwardDefault<<>>( - data_value, spatial_shapes, data_level_start_index, data_sampling_loc, - data_attn_weight, grad_output, batch, spatial_size, num_heads, - channels, num_levels, num_query, num_points, grad_value, - grad_sampling_loc, grad_attn_weight)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.h b/kernels/ms_deform_attn_forward/ms_deform_attn_forward.h deleted file mode 100644 index 942601345..000000000 --- a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.h +++ /dev/null @@ -1,59 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_ -#define KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_ - -#include "kernels/kernel.h" -#include "mlu_op.h" - -#define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) -#define MS_DEFORM_ATTN_FORWARD_HEADVECTOR 1 - -template -__mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int batch_size, const int num_keys, const int num_heads, - const int channels, const int num_levels, const int num_queries, - const int num_points, char *data_col_gdram); - -template -__mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int batch_size, const int num_keys, const int num_heads, - const int channels, const int num_levels, const int num_queries, - const int num_points, char *data_col_gdram); - -template -__mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); - -#endif // KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_ diff --git a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu b/kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu deleted file mode 100644 index 22346b76d..000000000 --- a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu +++ /dev/null @@ -1,340 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/ms_deform_attn_forward/ms_deform_attn_forward.h" - -#include "core/context.h" -#include "core/logging.h" -#include "core/gen_case.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/tool.h" -#include "core/type.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -typedef enum { - /*!< Index is invalid. 
*/ - MS_DEFORM_ATTN_FORWARD_INVALID = 0, - /*!< MLUKernelMsDeformAttnForwardDefault */ - MS_DEFORM_ATTN_FORWARD_DEFAULT = 1, - /*!< MLUKernelMsDeformAttnForwardSmallChannel */ - MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL = 2, - /*!< MLUKernelMsDeformAttnForwardFast */ - MS_DEFORM_ATTN_FORWARD_FAST = 3, -} MsDeformAttnForwardPolicy; - -MsDeformAttnForwardPolicy msDeformAttnForwardPolicyFunc( - const mluOpHandle_t handle, cnrtDim3_t *k_dims, cnrtFunctionType_t *k_type, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points) { - // start U1 task - k_dims->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dims->y = - MIN((batch_size * num_queries * num_heads + k_dims->x - 1) / k_dims->x, - mluop::runtime::getClusterLimitCapability(handle)); - k_dims->z = 1; - - *k_type = CNRT_FUNC_TYPE_UNION1; - - int32_t nlp = num_levels * num_points; - int32_t nlpc = num_levels * num_points * channels; - - if (handle->arch == MLUOP_MLU370 && nlp <= 128 && nlpc <= 12288) { - return MS_DEFORM_ATTN_FORWARD_FAST; - } else if (handle->arch == MLUOP_MLU590 && nlp <= 128 && nlpc <= 8192) { - return MS_DEFORM_ATTN_FORWARD_FAST; - } else if (nlp * 3 * sizeof(int32_t) > handle->nram_size) { - return MS_DEFORM_ATTN_FORWARD_DEFAULT; - } else if (channels > handle->nram_size / 12 / sizeof(float) || - channels > 96 || channels < 16) { - return MS_DEFORM_ATTN_FORWARD_DEFAULT; - } else { - return MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL; - } -} - -static mluOpStatus_t paramcheck( - const mluOpTensorDescriptor_t data_value_desc, - const mluOpTensorDescriptor_t data_spatial_shapes_desc, - const mluOpTensorDescriptor_t data_level_start_index_desc, - const mluOpTensorDescriptor_t data_sampling_loc_desc, - const mluOpTensorDescriptor_t data_attn_weight_desc, - const mluOpTensorDescriptor_t data_col_desc) { - // check tensor dim - // params data_value: [bs, num_keys, 
num_heads, channels] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_value_desc->dim, 4); - // params data_spatial_shapes: [num_levels, 2] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_spatial_shapes_desc->dim, - 2); - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", - data_spatial_shapes_desc->dims[1], 2); - // params data_level_start_index: [num_levels] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_level_start_index_desc->dim, - 1); - // params data_sampling_loc: - // [bs, num_queries, num_heads, num_levels, num_points, 2] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_sampling_loc_desc->dim, 6); - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_sampling_loc_desc->dims[5], - 2); - // params data_attn_weight: - // [bs, num_queries, num_heads, num_levels, num_points] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_attn_weight_desc->dim, 5); - // params data_col: [bs, num_queries, num_heads, channels] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_col_desc->dim, 4); - // check tensor shape - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_value_desc->dims[0] == data_col_desc->dims[0]) && - (data_sampling_loc_desc->dims[0] == data_col_desc->dims[0]) && - (data_attn_weight_desc->dims[0] == data_col_desc->dims[0])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_value_desc->dims[2] == data_col_desc->dims[2]) && - (data_sampling_loc_desc->dims[2] == data_col_desc->dims[2]) && - (data_attn_weight_desc->dims[2] == data_col_desc->dims[2])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_value_desc->dims[3] == data_col_desc->dims[3]); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_spatial_shapes_desc->dims[0] == - data_level_start_index_desc->dims[0]) && - (data_spatial_shapes_desc->dims[0] == - data_sampling_loc_desc->dims[3]) && - (data_spatial_shapes_desc->dims[0] == - data_attn_weight_desc->dims[3])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_sampling_loc_desc->dims[1] == data_col_desc->dims[1]) && - 
(data_attn_weight_desc->dims[1] == data_col_desc->dims[1])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc_desc->dims[4] == - data_attn_weight_desc->dims[4]); - // check tensor datatype - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_value_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_spatial_shapes_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_level_start_index_desc->dtype == MLUOP_DTYPE_INT32); - // data_value, data_sampling_loc, data_attn_weight, - // data_col datatype must be the same - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_value_desc->dtype == data_col_desc->dtype) && - (data_sampling_loc_desc->dtype == data_col_desc->dtype) && - (data_attn_weight_desc->dtype == data_col_desc->dtype)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t data_value_desc, - const void *data_value, - const mluOpTensorDescriptor_t data_spatial_shapes_desc, - const void *data_spatial_shapes, - const mluOpTensorDescriptor_t data_level_start_index_desc, - const void *data_level_start_index, - const mluOpTensorDescriptor_t data_sampling_loc_desc, - const void *data_sampling_loc, - const mluOpTensorDescriptor_t data_attn_weight_desc, - const void *data_attn_weight, const int32_t im2col_step, - const mluOpTensorDescriptor_t data_col_desc, void *data_col) { - // handle and desc ptr check null - PARAM_CHECK("[mluOpMsDeformAttnForward]", handle != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_value_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_spatial_shapes_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_level_start_index_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_attn_weight_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_col_desc 
!= NULL); - // check params - mluOpStatus_t paramcheck_status = paramcheck( - data_value_desc, data_spatial_shapes_desc, data_level_start_index_desc, - data_sampling_loc_desc, data_attn_weight_desc, data_col_desc); - if (paramcheck_status != MLUOP_STATUS_SUCCESS) { - return paramcheck_status; - } - size_t data_value_element_num = mluOpGetTensorElementNum(data_value_desc); - size_t data_sampling_loc_element_num = - mluOpGetTensorElementNum(data_sampling_loc_desc); - size_t data_col_element_num = mluOpGetTensorElementNum(data_col_desc); - // check large tensor - TENSOR_NUM_CHECK("[mluOpMsDeformAttnForward]", data_value_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMsDeformAttnForward]", data_col_element_num, - LARGE_TENSOR_NUM, ""); - const int32_t batch_size = data_value_desc->dims[0]; - const int32_t num_keys = data_value_desc->dims[1]; - const int32_t num_heads = data_value_desc->dims[2]; - const int32_t channels = data_value_desc->dims[3]; - const int32_t num_levels = data_spatial_shapes_desc->dims[0]; - const int32_t num_queries = data_sampling_loc_desc->dims[1]; - const int32_t num_points = data_sampling_loc_desc->dims[4]; - // check element num zero - if (batch_size == 0 || num_heads == 0 || channels == 0 || num_queries == 0) { - LOG(ERROR) << "[mluOpMsDeformAttnForward] Check failed: element num zero."; - return MLUOP_STATUS_BAD_PARAM; - } - if (num_levels == 0 || num_points == 0) { - VLOG(5) << "cnnlFill_v3 start."; - const float fill_value = 0.0f; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(data_col_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, data_col)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - VLOG(5) << "cnnlFill_v3 end."; - VLOG(5) << 
"mluOpMsDeformAttnForward skip zero element."; - return MLUOP_STATUS_SUCCESS; - } - // check im2col_step param - const int32_t im2col_step_ = MIN(batch_size, im2col_step); - PARAM_CHECK("[mluOpMsDeformAttnForward]", im2col_step_ > 0); - PARAM_CHECK("[mluOpMsDeformAttnForward]", batch_size % im2col_step_ == 0); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_value != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_spatial_shapes != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_level_start_index != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_attn_weight != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_col != NULL); - // generate mluOpMsDeformAttnForward prototxt start! - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("ms_deform_attn_forward"); - // set handle dump mlu output - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "data_value", data_value, data_value_desc, 10, -10); - GEN_CASE_DATA(true, "data_spatial_shapes", data_spatial_shapes, - data_spatial_shapes_desc, 10, -10); - GEN_CASE_DATA(true, "data_level_start_index", data_level_start_index, - data_level_start_index_desc, 10, -10); - GEN_CASE_DATA(true, "data_sampling_loc", data_sampling_loc, - data_sampling_loc_desc, 10, -10); - GEN_CASE_DATA(true, "data_attn_weight", data_attn_weight, - data_attn_weight_desc, 10, -10); - GEN_CASE_DATA(false, "data_col", data_col, data_col_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "ms_deform_attn_forward", "im2col_step", - im2col_step); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - cnrtDim3_t k_dims; - cnrtFunctionType_t k_type; - MsDeformAttnForwardPolicy policy = msDeformAttnForwardPolicyFunc( - handle, &k_dims, &k_type, batch_size, num_keys, num_heads, channels, - num_levels, num_queries, num_points); - switch (policy) { - default: { - VLOG(5) << "[mluOpMsDeformAttnForward] Policy not supported"; - return MLUOP_STATUS_BAD_PARAM; - }; break; 
- case MS_DEFORM_ATTN_FORWARD_DEFAULT: { - switch (k_type) { - default: { - VLOG(5) << "Not Implemented"; - break; - } - case CNRT_FUNC_TYPE_BLOCK: { - VLOG(5) - << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<>>"; - KERNEL_CHECK( - (MLUKernelMsDeformAttnForwardDefault - <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); - break; - } - case CNRT_FUNC_TYPE_UNION1: { - VLOG(5) << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<>>"; - KERNEL_CHECK( - (MLUKernelMsDeformAttnForwardDefault - <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); - break; - } - } - break; - } - case MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL: { - switch (k_type) { - default: { - VLOG(5) << "Not Implemented"; - break; - } - case CNRT_FUNC_TYPE_BLOCK: { - VLOG(5) << "Launch Kernel " - "MLUKernelMsDeformAttnForwardSmallChannel<<>>"; - KERNEL_CHECK( - (MLUKernelMsDeformAttnForwardSmallChannel - <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); - break; - } - case CNRT_FUNC_TYPE_UNION1: { - VLOG(5) << "Launch Kernel " - "MLUKernelMsDeformAttnForwardSmallChannel<<>>"; - KERNEL_CHECK( - (MLUKernelMsDeformAttnForwardSmallChannel - <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); - 
break; - } - } - break; - } - case MS_DEFORM_ATTN_FORWARD_FAST: { - VLOG(5) << "Launch Kernel MLUKernelMsDeformAttnForwardFast<<>>"; - KERNEL_CHECK((MLUKernelMsDeformAttnForwardFast - <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, - (char *)data_sampling_loc, (char *)data_attn_weight, - batch_size, num_keys, num_heads, channels, num_levels, - num_queries, num_points, (char *)data_col))); - break; - } - } - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h deleted file mode 100644 index 4e9360927..000000000 --- a/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h +++ /dev/null @@ -1,398 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_UTILS_H_ -#define KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_UTILS_H_ - -#include -#include - -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -#define BIT_COLLECT_PAD (8) -#define BACKWARD_MAX_NQ_NL_NP (1024) - -#if (__BANG_ARCH__ >= 372) - -__mlu_func__ void broadcastSpatialHW( - float* spatial_offset_bd_nram, // (num_levels, num_points) - float* spatial_h_bd_nram, // (num_levels, num_points) - float* spatial_w_bd_nram, // (num_levels, num_points) - int32_t* spatial_shapes_nram, // (num_levels, 2) - int32_t* spatial_offset_nram, // (num_levels) - const int32_t num_levels, const int32_t num_points) { - __bang_int322float((float*)spatial_shapes_nram, spatial_shapes_nram, - num_levels * 2, 0); - __memcpy(spatial_h_bd_nram, spatial_shapes_nram, sizeof(float), NRAM2NRAM, - sizeof(float), num_points - 1, num_points * sizeof(float), - num_levels - 1, 0, num_points - 1, 2 * sizeof(float), - num_levels - 1); - __memcpy(spatial_w_bd_nram, (float*)spatial_shapes_nram + 1, sizeof(float), - NRAM2NRAM, sizeof(float), num_points - 1, num_points * sizeof(float), - num_levels - 1, 0, num_points - 1, 2 * sizeof(float), - num_levels - 1); - __bang_int322float((float*)spatial_offset_nram, spatial_offset_nram, - num_levels, 0); - __memcpy(spatial_offset_bd_nram, spatial_offset_nram, sizeof(float), - NRAM2NRAM, sizeof(float), num_points - 1, num_points * sizeof(float), - num_levels - 1, 0, num_points - 1, sizeof(float), num_levels - 1); -} - -template -__mlu_func__ void prepareLoopV2( - int32_t* seq_nram, T* zeros_nram, int32_t* spatial_offset_nram, - int32_t* spatial_hw_nram, int8_t* mask_x_nram, int8_t* mask_y_nram, - T* spatial_offset_bd_nram, T* spatial_h_bd_nram, T* spatial_w_bd_nram, - T* value_sram, const void* data_level_start_index_gdram, - const void* data_spatial_shapes_gdram, const int32_t num_keys, - const int32_t 
num_levels, const int32_t num_points, - const int32_t max_deal_n, const int32_t mask_size, const int32_t channels) { - if (seq_nram != nullptr) { - for (int i = 0; i < 8; i++) { - seq_nram[i] = i; - } - __bang_add_scalar(seq_nram + 8, seq_nram, 8, 8); // [0, 7] + 8 - __bang_add_scalar(seq_nram + 16, seq_nram, 16, 16); // [0, 15] + 16 - __bang_add_scalar(seq_nram + 32, seq_nram, 32, 32); // [0, 31] + 32 - __bang_add_scalar(seq_nram + 64, seq_nram, 64, 64); - __bang_add_scalar(seq_nram + 128, seq_nram, 128, 128); - __bang_add_scalar(seq_nram + 256, seq_nram, 256, 256); - __bang_add_scalar(seq_nram + 512, seq_nram, 512, 512); // [0, 511] + 512 - } - __bang_write_value(zeros_nram, channels, (T)0); - __bang_write_value(mask_x_nram, mask_size, (char)0x55); - __bang_write_value(mask_y_nram, mask_size, (char)0xAA); - __memcpy_async(spatial_offset_nram, data_level_start_index_gdram, - num_levels * sizeof(int32_t), GDRAM2NRAM); - __memcpy_async(spatial_hw_nram, data_spatial_shapes_gdram, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - __sync_io_move_compute(); - broadcastSpatialHW(spatial_offset_bd_nram, spatial_h_bd_nram, - spatial_w_bd_nram, spatial_hw_nram, spatial_offset_nram, - num_levels, num_points); -} - -/* - Split batch*head into taskDimY, the split num_queries into coreDim. - This plan is used to staying data_value on SRAM. 
-*/ -__mlu_func__ void splitTaskV1( - int32_t& cluster_begin_batch_head, int32_t& cluster_act_batch_head, - int32_t& cluster_end_batch_head, int32_t& core_begin_query, - int32_t& core_act_query, int32_t& core_loop_num, int32_t& core_step_query, - const int32_t max_deal_n, const int32_t batch_size, const int32_t num_keys, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_queries, const int32_t num_points) { - // split batch*head into taskDimY - int32_t batch_head = batch_size * num_heads; - int32_t cluster_avg_batch_head = (batch_head + taskDimY - 1) / taskDimY; - cluster_begin_batch_head = taskIdY * cluster_avg_batch_head; - cluster_act_batch_head = - std::min(cluster_avg_batch_head, batch_head - cluster_begin_batch_head); - cluster_end_batch_head = cluster_begin_batch_head + cluster_act_batch_head; - // split query into coreDim - int32_t core_avg_query = (num_queries + coreDim - 1) / coreDim; - core_begin_query = coreId * core_avg_query; - core_act_query = std::min(num_queries - core_begin_query, core_avg_query); - core_loop_num = (core_act_query + max_deal_n - 1) / max_deal_n; - core_step_query = core_loop_num > 0 - ? (core_act_query + core_loop_num - 1) / core_loop_num - : 0; -} - -/* - Split num_queries into taskDim. 
- Each core iterate in batch * head -*/ -__mlu_func__ void splitTaskV2( - int32_t& cluster_begin_batch_head, int32_t& cluster_act_batch_head, - int32_t& cluster_end_batch_head, int32_t& core_begin_query, - int32_t& core_act_query, int32_t& core_loop_num, int32_t& core_step_query, - const int32_t max_deal_n, const int32_t batch_size, const int32_t num_keys, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_queries, const int32_t num_points) { - // not split batch*head - int32_t batch_head = batch_size * num_heads; - cluster_begin_batch_head = 0; - cluster_act_batch_head = batch_head; - cluster_end_batch_head = batch_head; - // split query into taskDim - int32_t core_avg_query = (num_queries + taskDim - 1) / taskDim; - core_begin_query = taskId * core_avg_query; - core_act_query = std::min(num_queries - core_begin_query, core_avg_query); - core_loop_num = (core_act_query + max_deal_n - 1) / max_deal_n; - core_step_query = core_loop_num > 0 - ? 
(core_act_query + core_loop_num - 1) / core_loop_num - : 0; -} - -template -__mlu_func__ void computePolationWeightOffsetCond( - int32_t* data_offset_nram, T* weight_polation_nram, - T* cond_point_polation_nram, T* cond_point_valid_nram, T* loc_nram, - int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram, - T* spatial_w_bd_nram, T* spatial_h_bd_nram, T* delata_xy_nram, T* buf_nram, - const bool cached_delta_xy, const int32_t deal_n, const int32_t num_levels, - const int32_t num_points, const int32_t num_heads, const int32_t channels) { - int32_t total_points = deal_n * num_levels * num_points; - int32_t block_points = num_levels * num_points; - T* buf_x_nram = buf_nram; - T* buf_y_nram = buf_nram + total_points; - T* buf_cond_nram = buf_nram + 2 * total_points; - T* buf_x_floor = buf_nram + 2 * total_points; - T* buf_x_ceil = buf_nram + 3 * total_points; - T* buf_y_floor = buf_nram + 4 * total_points; - T* buf_y_ceil = buf_nram + 5 * total_points; - //================================================================================================ - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - __bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); - __bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); - // x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5; - __bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5, - total_points, block_points); - __bang_fusion(FUSION_FMS, buf_y_nram, buf_y_nram, spatial_h_bd_nram, (T)0.5, - total_points, block_points); - //================================================================================================ - // get point condition. 
use buf0, buf1, buf2 - // (x > -1 && y > -1 && y < spatial_h && x < spatial_w) - __bang_gt_scalar(cond_point_valid_nram, buf_x_nram, (T)-1.0, total_points); - __bang_gt_scalar(buf_cond_nram, buf_y_nram, (T)-1.0, total_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_x_nram, spatial_w_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_y_nram, spatial_h_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - //================================================================================================ - __bang_floor(buf_x_floor, buf_x_nram, total_points); - __bang_add_scalar(buf_x_ceil, buf_x_floor, 1.0, total_points); - __bang_floor(buf_y_floor, buf_y_nram, total_points); - __bang_add_scalar(buf_y_ceil, buf_y_floor, 1.0, total_points); - T* cond_point_polation_nram_tl = cond_point_polation_nram; - T* cond_point_polation_nram_bl = cond_point_polation_nram + total_points; - T* cond_point_polation_nram_tr = cond_point_polation_nram + 2 * total_points; - T* cond_point_polation_nram_br = cond_point_polation_nram + 3 * total_points; - T* cond_point_polation_nram_cond1 = weight_polation_nram; - T* cond_point_polation_nram_cond2 = weight_polation_nram + total_points; - T* cond_point_polation_nram_cond3 = weight_polation_nram + 2 * total_points; - T* cond_point_polation_nram_cond4 = weight_polation_nram + 3 * total_points; - __bang_ge_scalar(cond_point_polation_nram_cond1, buf_x_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond2, buf_x_ceil, spatial_w_bd_nram, - total_points, block_points); - __bang_ge_scalar(cond_point_polation_nram_cond3, buf_y_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond4, buf_y_ceil, spatial_h_bd_nram, - 
total_points, block_points); - __bang_and(cond_point_polation_nram_tl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_bl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond3, total_points); - __bang_and(cond_point_polation_nram_tr, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_br, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond3, total_points); - //================================================================================================ - // get polation weight. - T* buf_dx = (T*)data_offset_nram; - T* buf_dy = buf_dx + total_points; - T* buf_dx_1 = buf_dy + total_points; - T* buf_dy_1 = buf_dx_1 + total_points; - T* weight_polation_nram_1 = weight_polation_nram; - T* weight_polation_nram_2 = weight_polation_nram + 1 * total_points; - T* weight_polation_nram_3 = weight_polation_nram + 2 * total_points; - T* weight_polation_nram_4 = weight_polation_nram + 3 * total_points; - // T* weight_polation_nram_buf = buf_nram + 4 * total_points; - __bang_sub(buf_dx, buf_x_floor, buf_x_nram, total_points); // -dx - __bang_sub(buf_dy, buf_y_floor, buf_y_nram, total_points); // -dy - __bang_fusion(FUSION_FSS, buf_dx_1, buf_x_nram, buf_x_floor, - (T)1.0, // dx - 1 - total_points, total_points); - __bang_fusion(FUSION_FSS, buf_dy_1, buf_y_nram, buf_y_floor, - (T)1.0, // dy - 1 - total_points, total_points); - __bang_mul(weight_polation_nram_1, buf_dx_1, buf_dy, - total_points); // (-dy)(dx-1) - __bang_mul(weight_polation_nram_2, buf_dx_1, buf_dy_1, - total_points); // (dx-1)*(dy-1) - __bang_mul(weight_polation_nram_3, buf_dx, buf_dy, - total_points); // (-dx)*(-dy) - __bang_mul(weight_polation_nram_4, buf_dx, buf_dy_1, - total_points); // (-dx)*(dy-1) - if (cached_delta_xy) { - __bang_sub(delata_xy_nram, buf_x_nram, buf_x_floor, total_points); // dx - __bang_add_scalar(delata_xy_nram + total_points, 
buf_dx, 1, - total_points); // 1-dx - __bang_sub(delata_xy_nram + 2 * total_points, buf_y_nram, buf_y_floor, - total_points); // dy - __bang_add_scalar(delata_xy_nram + 3 * total_points, buf_dy, 1, - total_points); // 1-dy - } - //================================================================================================ - // correct the x,y in [0, w-1] and [0, h-1] - T* spatial_w1_bd_nram = buf_nram; - T* spatial_h1_bd_nram = buf_nram + block_points; - __bang_sub_scalar(spatial_w1_bd_nram, spatial_w_bd_nram, (T)1, block_points); - __bang_sub_scalar(spatial_h1_bd_nram, spatial_h_bd_nram, (T)1, block_points); - __bang_maxeq_scalar(buf_x_floor, buf_x_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_x_ceil, buf_x_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_x_floor, buf_x_floor, spatial_w1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_x_ceil, buf_x_ceil, spatial_w1_bd_nram, - total_points, block_points); - __bang_maxeq_scalar(buf_y_floor, buf_y_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_y_ceil, buf_y_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_y_floor, buf_y_floor, spatial_h1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_y_ceil, buf_y_ceil, spatial_h1_bd_nram, - total_points, block_points); - //================================================================================================ - // offset = y*w + x - T* buf_hw_offset = buf_nram; - T* data_offset_nram_tl = (T*)data_offset_nram; - T* data_offset_nram_bl = data_offset_nram_tl + total_points; - T* data_offset_nram_tr = data_offset_nram_bl + total_points; - T* data_offset_nram_br = data_offset_nram_tr + total_points; - // y_ceil*w + offset + x_floor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_ceil, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_tl, buf_hw_offset, buf_x_floor, total_points); - // y_ceil*w + offset + x_ceil - __bang_add(data_offset_nram_tr, 
buf_hw_offset, buf_x_ceil, total_points); - // y_floor*w + offset + x_foor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_floor, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_bl, buf_hw_offset, buf_x_floor, total_points); - // y_floor*w + offset + x_ceil - __bang_add(data_offset_nram_br, buf_hw_offset, buf_x_ceil, total_points); - __bang_float2int32(data_offset_nram, (T*)data_offset_nram, total_points * 4, - 0); - int32_t stride = num_heads * channels * sizeof(T); - __bang_mul_scalar(data_offset_nram, data_offset_nram, stride, - total_points * 4); - //================================================================================================ - // merge conditions and clear weight, cast conditions to bits - T* cond_point_polation_nram_tmp = buf_nram; - __bang_cycle_and(cond_point_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, 4 * total_points, total_points); - __bang_float2int32((int32_t*)cond_point_polation_nram_tmp, - cond_point_polation_nram, total_points * 4, 0); - __bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp, - (int32_t*)cond_point_polation_nram_tmp, (int32_t)0xffffffff, - total_points * 4); - __bang_band((char*)weight_polation_nram, (char*)weight_polation_nram, - (char*)cond_point_polation_nram_tmp, - total_points * 4 * sizeof(float)); -} - -/* - compute condition, polation_weight, offset and store to SRAM. - cache_delta_xy and cache_point_valid is true in backward, false in forward. 
-*/ -template -__mlu_func__ void stageOneLoop( - T* sampling_loc_gdram, T* weight_attn_gdram, int32_t* data_offset_nram, - void* delata_xy_nram, T* weight_polation_nram, T* cond_point_polation_nram, - T* cond_point_valid_nram, T* loc_nram, T* buf_nram, T* buf_nram_end, - int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram, - T* spatial_w_bd_nram, T* spatial_h_bd_nram, int32_t* spatial_offset_nram, - int32_t* spatial_hw_nram, int32_t* data_offset_sram, void* delta_xy_sram, - T* weight_polation_sram, T* weight_attn_sram, T* cond_point_polation_sram, - const bool cache_delta_xy, const bool cache_point_valid, - const int32_t total_deal_n, const int32_t max_deal_n, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_points, const int32_t input_stride_2, - const int32_t input_stride_3) { - int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - int32_t num_levels_points = num_levels * num_points; - int32_t sram_offset = 0; - int32_t sram_dst_stride = total_deal_n * num_levels_points * sizeof(T); - for (int i = 0; i < loop_num; i++) { - int32_t deal_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t deal_point_num = deal_n * num_levels_points; - int32_t copy_size = deal_point_num * sizeof(T); - __memcpy(loc_nram, sampling_loc_gdram + i * max_deal_n * input_stride_3 * 2, - input_stride_2 * 2 * sizeof(T), GDRAM2NRAM, - input_stride_2 * 2 * sizeof(T), input_stride_3 * 2 * sizeof(T), - deal_n - 1); - computePolationWeightOffsetCond( - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, loc_nram, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_w_bd_nram, spatial_h_bd_nram, - (T*)delata_xy_nram, buf_nram, cache_delta_xy, deal_n, num_levels, - num_points, num_heads, channels); - __memcpy(data_offset_sram + sram_offset, data_offset_nram, copy_size, - NRAM2SRAM, sram_dst_stride, copy_size, 3); - __memcpy(weight_polation_sram + sram_offset, 
weight_polation_nram, - copy_size, NRAM2SRAM, sram_dst_stride, copy_size, 3); - __memcpy(cond_point_polation_sram + sram_offset, cond_point_polation_nram, - copy_size, NRAM2SRAM, sram_dst_stride, copy_size, 3); - if (cache_point_valid) { - __memcpy(cond_point_polation_sram + 4 * total_deal_n * num_levels_points + - sram_offset, - cond_point_valid_nram, copy_size, NRAM2SRAM); - } - if (cache_delta_xy) { - __memcpy((T*)delta_xy_sram + sram_offset, delata_xy_nram, copy_size, - NRAM2SRAM, sram_dst_stride, copy_size, 3); - } - __memcpy(buf_nram, weight_attn_gdram + i * max_deal_n * input_stride_3, - input_stride_2 * sizeof(T), GDRAM2NRAM, input_stride_2 * sizeof(T), - input_stride_3 * sizeof(T), deal_n - 1); - __bang_float2int32((int32_t*)cond_point_valid_nram, cond_point_valid_nram, - deal_point_num, 0); - __bang_mul_scalar((int32_t*)cond_point_valid_nram, - (int32_t*)cond_point_valid_nram, (int32_t)0xffffffff, - deal_point_num); - __bang_band((char*)buf_nram, (char*)buf_nram, (char*)cond_point_valid_nram, - deal_n * num_levels * num_points * sizeof(T)); - __memcpy(weight_attn_sram + sram_offset, buf_nram, copy_size, NRAM2SRAM); - sram_offset += deal_point_num; - } - __sync_io_move_compute(); -} -#endif - -#if (__BANG_ARCH__ == 592) -__mlu_func__ void gatherAsync(void* dst, void* src, unsigned int* offset, - void* mask, int transfer_size, - mluMemcpyDirection_t dir, int dst_stride, - int transfer_num) { - __gather_async(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); -} - -__mlu_func__ void gatherSync(void* dst, void* src, unsigned int* offset, - void* mask, int transfer_size, - mluMemcpyDirection_t dir, int dst_stride, - int transfer_num) { - __gather(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); -} -#endif - -#endif diff --git a/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu deleted file mode 100644 index 8397bb276..000000000 --- 
a/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ /dev/null @@ -1,1280 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "ms_deform_attn_utils.h" - -#pragma bang walign(64) - -#if (__BANG_ARCH__ >= 372) - -#define MAX_MEMCPY_SEGNUM (65536) -#define NRAM_REMAIN_SIZE (48 * 1024) -#define SRAM_REMAIN_SIZE (32 * 1024) -#define NRAM_AVALIABLE_SIZE (__MLU_NRAM_SIZE__ * 1024 - NRAM_REMAIN_SIZE) -#define WRAM_AVALIABLE_SIZE (__MLU_WRAM_SIZE__ * 1024) -#define SRAM_AVALIABLE_SIZE (__MLU_SRAM_SIZE__ * 1024 - SRAM_REMAIN_SIZE) -#define SRAM_FOR_VALUE_SIZE (SRAM_AVALIABLE_SIZE - 128) - -#ifndef LT_NUM -#define LT_NUM 64 -#endif - -#ifndef WRAM_LT_STRIDE -#define WRAM_LT_STRIDE (__MLU_WRAM_SIZE__ * 1024 / LT_NUM) -#endif - -#ifndef WRAM_ALIGN_SIZE -#define WRAM_ALIGN_SIZE (64) -#endif - -__nram__ char nram_buffer[NRAM_AVALIABLE_SIZE]; -__mlu_shared__ char sram_buffer[SRAM_AVALIABLE_SIZE]; -__wram__ char wram_buffer[WRAM_AVALIABLE_SIZE]; - -template -__mlu_func__ void tileWeight2WramAsync(T* dst, - T* src, // (co, ci) - int32_t co, int32_t ci, int32_t pad_co, - int32_t pad_ci) { - int32_t co_num = co / LT_NUM; - int32_t co_remain = co % LT_NUM; - if (co_num > 0) { - __memcpy_async(dst, src, ci * sizeof(T), NRAM2WRAM, WRAM_LT_STRIDE, - LT_NUM - 1, pad_ci * sizeof(T), co_num - 1, ci * sizeof(T), - LT_NUM - 1, LT_NUM * ci * sizeof(T), co_num - 1); - } - if (co_remain > 0) { - __memcpy_async(dst + co_num * pad_ci, src + co_num * LT_NUM * ci, - ci * sizeof(T), NRAM2WRAM, WRAM_LT_STRIDE, ci * sizeof(T), - co_remain - 1); - } -} - -template -__mlu_func__ void tileWeight2WramSync(T* dst, - T* src, // (co, ci) - int32_t co, int32_t ci, int32_t pad_co, - int32_t pad_ci) { - int32_t co_num = co / LT_NUM; - int32_t co_remain = co % LT_NUM; - if (co_num > 0) { - __memcpy(dst, src, ci * sizeof(T), NRAM2WRAM, WRAM_LT_STRIDE, LT_NUM - 1, - pad_ci * sizeof(T), co_num - 1, ci * sizeof(T), LT_NUM - 1, - LT_NUM * ci * sizeof(T), co_num - 1); - } - if (co_remain > 0) { - __memcpy(dst + co_num * pad_ci, src + co_num * LT_NUM * ci, ci 
* sizeof(T), - NRAM2WRAM, WRAM_LT_STRIDE, ci * sizeof(T), co_remain - 1); - } -} - -template -__mlu_func__ void isValueContainInfNan(T* input_sram, T* output_sram, - T* nram_buf, bool& value_contain_infnan, - int32_t nram_buf_size, - int32_t data_num) { - int32_t core_avg_num = (data_num + coreDim - 1) / coreDim; - int32_t core_begin_num = core_avg_num * coreId; - int32_t core_act_num = __mluop_min(data_num - core_begin_num, core_avg_num); - int32_t core_step_num = - PAD_DOWN(nram_buf_size - NFU_ALIGN_SIZE, NFU_ALIGN_SIZE) / sizeof(T); - int32_t c = NFU_ALIGN_SIZE / sizeof(T); - int32_t loop_num = (core_act_num + core_step_num - 1) / core_step_num; - int32_t remain_num = (int)(loop_num > 0) * (core_act_num % core_step_num); - T* input_sram_base = input_sram + core_begin_num; - T* nram_out = nram_buf; - T* nram_input = nram_buf + NFU_ALIGN_SIZE / sizeof(T); - T sum = 0; - - if (remain_num > 0) { - int32_t n = (remain_num + c - 1) / c; - __bang_write_value(nram_input + (n - 1) * c, c, (T)0); - } - - for (int32_t i = 0; i < loop_num; i++) { - int32_t deal_num = - __mluop_min(core_step_num, core_act_num - i * core_step_num); - int32_t n = (deal_num + c - 1) / c; - __memcpy(nram_input, input_sram_base + i * core_step_num, - deal_num * sizeof(T), SRAM2NRAM); - __bang_sumpool(nram_out, nram_input, c, n, 1, n, 1, 1, 1); - __bang_sumpool(nram_input, nram_out, 1, c, 1, c, 1, 1, 1); - T tmp = nram_input[0]; - if (isnan(tmp) || isinf(tmp)) { - sum = 1; - break; - } else { - sum = 0; - } - } - - output_sram[coreId] = sum; - __sync_all_ipu_within_cluster(); - __memcpy(nram_input, output_sram, coreDim * sizeof(T), SRAM2NRAM); - value_contain_infnan = - (nram_input[0] + nram_input[1] + nram_input[2] + nram_input[3]) > 0; -} - -template -__mlu_func__ void getConditionCoordWeight( - int32_t* data_offset_nram, T* weight_polation_nram, - T* cond_point_polation_nram, T* cond_point_valid_nram, T* loc_nram, - T* weight_attn_nram, int8_t* mask_x_nram, int8_t* mask_y_nram, - T* 
spatial_offset_bd_nram, T* spatial_w_bd_nram, T* spatial_h_bd_nram, - T* buf_nram, bool& w_contain_inf, const bool value_contain_infnan, - const int32_t deal_n, const int32_t num_levels, const int32_t num_points, - const int32_t num_heads, const int32_t channels) { - int32_t total_points = deal_n * num_levels * num_points; - int32_t block_points = num_levels * num_points; - T* buf_x_nram = buf_nram; - T* buf_y_nram = buf_nram + total_points; - T* buf_cond_nram = buf_nram + 2 * total_points; - T* buf_x_floor = buf_nram + 2 * total_points; - T* buf_x_ceil = buf_nram + 3 * total_points; - T* buf_y_floor = buf_nram + 4 * total_points; - T* buf_y_ceil = buf_nram + 5 * total_points; - //================================================================================================ - // if weight_attn_nram contain inf - int32_t inf_p = 0x7f7fffff; - int32_t inf_n = 0xff7fffff; - T inf_p_f = *((T*)&inf_p); - T inf_n_f = *((T*)&inf_n); - __bang_lt_scalar(buf_nram, weight_attn_nram, inf_n_f, total_points); - __bang_gt_scalar(buf_nram + total_points, weight_attn_nram, inf_p_f, - total_points); - __bang_sumpool(buf_nram + 2 * total_points, buf_nram, 1, 2 * total_points, 1, - 2 * total_points, 1, 1, 1); - w_contain_inf = buf_nram[2 * total_points] > 0; - //================================================================================================ - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - __bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); - __bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); - // x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5; - __bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5, - total_points, block_points); - __bang_fusion(FUSION_FMS, buf_y_nram, buf_y_nram, spatial_h_bd_nram, (T)0.5, - total_points, block_points); - //================================================================================================ - // get point 
condition. use buf0, buf1, buf2 - // (x > -1 && y > -1 && y < spatial_h && x < spatial_w) - __bang_gt_scalar(cond_point_valid_nram, buf_x_nram, (T)-1.0, total_points); - __bang_gt_scalar(buf_cond_nram, buf_y_nram, (T)-1.0, total_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_x_nram, spatial_w_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_y_nram, spatial_h_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - //================================================================================================ - __bang_floor(buf_x_floor, buf_x_nram, total_points); - __bang_add_scalar(buf_x_ceil, buf_x_floor, 1.0, total_points); - __bang_floor(buf_y_floor, buf_y_nram, total_points); - __bang_add_scalar(buf_y_ceil, buf_y_floor, 1.0, total_points); - T* cond_point_polation_nram_tl = cond_point_polation_nram; - T* cond_point_polation_nram_bl = cond_point_polation_nram + total_points; - T* cond_point_polation_nram_tr = cond_point_polation_nram + 2 * total_points; - T* cond_point_polation_nram_br = cond_point_polation_nram + 3 * total_points; - T* cond_point_polation_nram_cond1 = weight_polation_nram; - T* cond_point_polation_nram_cond2 = weight_polation_nram + total_points; - T* cond_point_polation_nram_cond3 = weight_polation_nram + 2 * total_points; - T* cond_point_polation_nram_cond4 = weight_polation_nram + 3 * total_points; - __bang_ge_scalar(cond_point_polation_nram_cond1, buf_x_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond2, buf_x_ceil, spatial_w_bd_nram, - total_points, block_points); - __bang_ge_scalar(cond_point_polation_nram_cond3, buf_y_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond4, buf_y_ceil, spatial_h_bd_nram, 
- total_points, block_points); - __bang_and(cond_point_polation_nram_tl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_bl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond3, total_points); - __bang_and(cond_point_polation_nram_tr, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_br, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond3, total_points); - //================================================================================================ - // get polation weight. - T* buf_dx = (T*)data_offset_nram; - T* buf_dy = buf_dx + total_points; - T* buf_dx_1 = buf_dy + total_points; - T* buf_dy_1 = buf_dx_1 + total_points; - // -dx = x_floor-x - // -dy = y_floor-y - // w1 = (1-dx)*dy = (dx-1)*(-dy) - // w2 = (1-dx)*(1-dy) = (dx-1)*(dy-1) - // w3 = dx*dy = (-dx)*(-dy) - // w4 = dx*(1-dy) = (-dx)*(dy-1) - T* weight_polation_nram_1 = weight_polation_nram; - T* weight_polation_nram_2 = weight_polation_nram + 1 * total_points; - T* weight_polation_nram_3 = weight_polation_nram + 2 * total_points; - T* weight_polation_nram_4 = weight_polation_nram + 3 * total_points; - // T* weight_polation_nram_buf = buf_nram + 4 * total_points; - __bang_sub(buf_dx, buf_x_floor, buf_x_nram, total_points); - __bang_sub(buf_dy, buf_y_floor, buf_y_nram, total_points); - __bang_fusion(FUSION_FSS, buf_dx_1, buf_x_nram, buf_x_floor, (T)1.0, - total_points, total_points); - __bang_fusion(FUSION_FSS, buf_dy_1, buf_y_nram, buf_y_floor, (T)1.0, - total_points, total_points); - __bang_mul(weight_polation_nram_1, buf_dx_1, buf_dy, total_points); - __bang_mul(weight_polation_nram_2, buf_dx_1, buf_dy_1, total_points); - __bang_mul(weight_polation_nram_3, buf_dx, buf_dy, total_points); - __bang_mul(weight_polation_nram_4, buf_dx, buf_dy_1, total_points); - 
//================================================================================================ - // correct the x,y in [0, w-1] and [0, h-1] - T* spatial_w1_bd_nram = buf_nram; - T* spatial_h1_bd_nram = buf_nram + block_points; - __bang_sub_scalar(spatial_w1_bd_nram, spatial_w_bd_nram, (T)1, block_points); - __bang_sub_scalar(spatial_h1_bd_nram, spatial_h_bd_nram, (T)1, block_points); - __bang_maxeq_scalar(buf_x_floor, buf_x_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_x_ceil, buf_x_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_x_floor, buf_x_floor, spatial_w1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_x_ceil, buf_x_ceil, spatial_w1_bd_nram, - total_points, block_points); - __bang_maxeq_scalar(buf_y_floor, buf_y_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_y_ceil, buf_y_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_y_floor, buf_y_floor, spatial_h1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_y_ceil, buf_y_ceil, spatial_h1_bd_nram, - total_points, block_points); - //================================================================================================ - // offset = y*w + x - T* buf_hw_offset = buf_nram; - T* data_offset_nram_tl = (T*)data_offset_nram; - T* data_offset_nram_bl = data_offset_nram_tl + total_points; - T* data_offset_nram_tr = data_offset_nram_bl + total_points; - T* data_offset_nram_br = data_offset_nram_tr + total_points; - // y_ceil*w + offset + x_floor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_ceil, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_tl, buf_hw_offset, buf_x_floor, total_points); - // y_ceil*w + offset + x_ceil - __bang_add(data_offset_nram_tr, buf_hw_offset, buf_x_ceil, total_points); - // y_floor*w + offset + x_foor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_floor, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_bl, 
buf_hw_offset, buf_x_floor, total_points); - // y_floor*w + offset + x_ceil - __bang_add(data_offset_nram_br, buf_hw_offset, buf_x_ceil, total_points); - //================================================================================================ - // merge and select conditions and weight - T* weight_polation_nram_tmp = (T*)buf_nram; - __bang_cycle_and(cond_point_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, 4 * total_points, total_points); - if (!w_contain_inf) { - __bang_cycle_mul(weight_polation_nram, weight_polation_nram, - weight_attn_nram, 4 * total_points, total_points); - } - __bang_mul_scalar(buf_nram, weight_attn_nram, (T)1, total_points); - __bang_collect((float*)weight_attn_nram, (float*)buf_nram, - cond_point_valid_nram, total_points); - __bang_float2int32((int32_t*)cond_point_polation_nram, - cond_point_polation_nram, total_points * 4, 0); - __bang_mul_scalar((int32_t*)cond_point_polation_nram, - (int32_t*)cond_point_polation_nram, (int32_t)0xffffffff, - total_points * 4); - __bang_band((char*)weight_polation_nram_tmp, (char*)weight_polation_nram, - (char*)cond_point_polation_nram, - total_points * 4 * sizeof(float)); - __bang_collect((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, - cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + total_points, - (float*)weight_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + 2 * total_points, - (float*)weight_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + 3 * total_points, - (float*)weight_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); - //================================================================================================ - // select cond_point_polation_nram if value_contain_infnan - if (value_contain_infnan) { - int32_t* 
cond_point_polation_nram_tmp = (int32_t*)buf_nram; - __bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp, - (int32_t*)cond_point_polation_nram, (int32_t)1, - total_points * 4); - __bang_collect((float*)cond_point_polation_nram, - (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, - total_points); - __bang_collect((float*)cond_point_polation_nram + total_points, - (float*)cond_point_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + 2 * total_points, - (float*)cond_point_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + 3 * total_points, - (float*)cond_point_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); - } - //================================================================================================ - // compute and select offset and stride - int32_t* data_offset_nram_tl_tmp = (int32_t*)buf_nram; - int32_t* data_offset_nram_bl_tmp = data_offset_nram_tl_tmp + total_points; - int32_t* data_offset_nram_tr_tmp = data_offset_nram_bl_tmp + total_points; - __bang_float2int32(data_offset_nram_tl_tmp, data_offset_nram_tl, - total_points * 4, 0); - int32_t stride = - SRAM_STAY ? 
channels * sizeof(T) : num_heads * channels * sizeof(T); - __bang_mul_scalar(data_offset_nram_tl_tmp, data_offset_nram_tl_tmp, stride, - total_points * 4); - __bang_sub((int32_t*)data_offset_nram_bl_tmp, - (int32_t*)data_offset_nram_bl_tmp, - (int32_t*)data_offset_nram_tl_tmp, total_points); - __bang_sub((int32_t*)data_offset_nram_tr_tmp, - (int32_t*)data_offset_nram_tr_tmp, - (int32_t*)data_offset_nram_tl_tmp, total_points); - __bang_collect((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp, - cond_point_valid_nram, total_points); - __bang_collect((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp, - cond_point_valid_nram, total_points); - __bang_collect((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp, - cond_point_valid_nram, total_points); -} - -/* - shape of each tensor: - output_nram: (channels) - input_nram: (4, valid_num, channels) - input_trans: (channels, 4, valid_num) - input_pooled: (channels, valid_num) - cond_selected_base: (4, deal_n, num_levels, num_points) - weight_selected_base: (4, deal_n, num_levels, num_points) - weight_attn_nram: (valid_num) - weight_compute: (4, valid_num) - cond_compute: (4, valid_num) - input_wram: (channels, 4 * valid_num) - - valid_num <= num_levels * num_points - sample_stride_3 = deal_n * num_levels * num_points - - Note: - If w_contain_inf is true, cannot merge attn_w and polation_w, so use sumpool - twice. If w_contain_inf is false, merge attn_w and polation_w and use matmul - instead. If value_contain_infnan is true, fill data_value of invalid - neighbors with 0. 
-*/ -template -__mlu_func__ void reduceLevelByConv( - T* output_nram, T* input_nram, T* input_trans, T* input_pooled, - int32_t* cond_selected_base, T* weight_selected_base, T* weight_attn_nram, - T* weight_compute, int32_t* cond_compute, T* input_wram, - const int32_t valid_num, const int32_t channels, - const int32_t sample_stride_3, const bool w_contain_inf, - const bool value_contain_infnan) { - if (valid_num > 0) { - int32_t ci = 4 * valid_num; - int32_t pad_ci = PAD_UP(ci, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t co = channels; - int32_t pad_co = PAD_UP(co, LT_NUM); - if (value_contain_infnan) { - __memcpy_async(cond_compute, cond_selected_base, - valid_num * sizeof(int32_t), NRAM2NRAM, - valid_num * sizeof(T), sample_stride_3 * sizeof(T), 3); - } - __memcpy_async(weight_compute, weight_selected_base, valid_num * sizeof(T), - NRAM2NRAM, valid_num * sizeof(T), - sample_stride_3 * sizeof(T), 3); - __bang_transpose(input_trans, input_nram, ci, co); - __sync_move(); - - if (value_contain_infnan) { - __bang_cycle_band((char*)input_trans, (char*)input_trans, - (char*)cond_compute, co * ci * sizeof(T), - ci * sizeof(T)); - } - - if (w_contain_inf) { - __bang_cycle_mul(input_trans, input_trans, weight_compute, co * ci, ci); - __bang_sumpool(input_pooled, input_trans, valid_num, channels, 4, 1, 4, 1, - 1); - __bang_cycle_mul(input_pooled, input_pooled, weight_attn_nram, - channels * valid_num, valid_num); - __bang_sumpool(output_nram, input_pooled, 1, channels, valid_num, 1, - valid_num, 1, 1); - } else { - tileWeight2WramSync(input_wram, input_trans, co, ci, pad_co, pad_ci); - __bang_conv(output_nram, weight_compute, input_wram, ci, 1, 1, 1, 1, 1, 1, - co); - } - - } else { - __bang_write_value(output_nram, channels, (T)0); - } -} - -template -__mlu_func__ int32_t getReduceLevelByConvWramSize(const int32_t num_levels, - const int32_t num_points, - const int32_t channels) { - int32_t ci = 4 * num_levels * num_points; - int32_t pad_ci = PAD_UP(ci, WRAM_ALIGN_SIZE / 
sizeof(T)); - int32_t co = channels; - int32_t pad_co = PAD_UP(co, LT_NUM); - return pad_co * pad_ci * sizeof(T); -} - -__mlu_func__ void loadNram2Gpr(int32_t& v1, int32_t& v2, int32_t& v3, - int32_t* p1, int32_t* p2, int32_t* p3) { - v1 = __load_nram(p1); - v2 = __load_nram(p2); - v3 = __load_nram(p3); -} - -/* - Load 4 neighbors use one 3D-memcpy, just use offset of N1, stride_3_1 and - stride_2_1. - |<- stride_3_1 ->| - N1 N3 - ^ - | - stride_2_1 - | - v - N2 N4 - - Trickly fold the loop as 2. -*/ -template -__mlu_func__ void loadDataValueXram2NramAsync( - T* buf_value_nram_1, int32_t* offset_1, int32_t* stride_2_1, - int32_t* stride_3_1, T* value_src, const int32_t num_levels_points, - const int32_t channel_size, const int32_t value_stride_3_size) { - int32_t offset_1_a, stride_2_1_a, stride_3_1_a; - int32_t offset_1_b, stride_2_1_b, stride_3_1_b; - loadNram2Gpr(offset_1_a, stride_2_1_a, stride_3_1_a, offset_1, stride_2_1, - stride_3_1); - loadNram2Gpr(offset_1_b, stride_2_1_b, stride_3_1_b, offset_1 + 1, - stride_2_1 + 1, stride_3_1 + 1); - int32_t value_offset = 0; - int32_t next = 0; - int32_t loop_num = num_levels_points / 2; - int32_t remain = num_levels_points % 2; - int32_t data_value_stride = num_levels_points * channel_size; - for (int32_t j = 0; j < loop_num * 2; j += 2) { - value_offset = j * channel_size; - next = j + 2; - __memcpy_async((int8_t*)buf_value_nram_1 + value_offset, - (int8_t*)value_src + offset_1_a, channel_size, DIR, - 2 * data_value_stride, 1, data_value_stride, 1, stride_3_1_a, - 1, stride_2_1_a, 1); - - loadNram2Gpr(offset_1_a, stride_2_1_a, stride_3_1_a, offset_1 + next, - stride_2_1 + next, stride_3_1 + next); - - __memcpy_async((int8_t*)buf_value_nram_1 + value_offset + channel_size, - (int8_t*)value_src + offset_1_b, channel_size, DIR, - 2 * data_value_stride, 1, data_value_stride, 1, stride_3_1_b, - 1, stride_2_1_b, 1); - - loadNram2Gpr(offset_1_b, stride_2_1_b, stride_3_1_b, offset_1 + next + 1, - stride_2_1 + next + 1, 
stride_3_1 + next + 1); - } - - if (remain > 0) { - value_offset = loop_num * 2 * channel_size; - __memcpy_async((int8_t*)buf_value_nram_1 + value_offset, - (int8_t*)value_src + offset_1_a, channel_size, DIR, - 2 * data_value_stride, 1, data_value_stride, 1, stride_3_1_a, - 1, stride_2_1_a, 1); - } -} - -/* - use matmul to count valid samples. - sample_valid_count: (deal_n) - cond_point_valid_nram: (deal_n, num_levels, num_points) - nram_ones: (num_levels, num_points) -*/ -template -__mlu_func__ void countValidSamples(int32_t* sample_valid_count, - T* cond_point_valid_nram, T* nram_ones, - T* wram_buffer, int32_t num_levels, - int32_t num_points, int32_t deal_n) { - int32_t ci = num_levels * num_points; - int32_t pad_ci = PAD_UP(ci, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t co = deal_n; - int32_t pad_co = PAD_UP(co, LT_NUM); - tileWeight2WramSync(wram_buffer, cond_point_valid_nram, co, ci, pad_co, - pad_ci); - __bang_conv((T*)sample_valid_count, nram_ones, wram_buffer, ci, 1, 1, 1, 1, 1, - 1, co); - __bang_float2int32(sample_valid_count, (T*)sample_valid_count, deal_n, 0); -} - -template -__mlu_func__ void loadNeighborPolationAttn( - T* value_output_nram, T* value_sram, T* value_gdram, - int32_t* data_offset_nram, T* weight_polation_nram, - T* cond_point_polation_nram, T* cond_point_valid_nram, T* weight_attn_nram, - T* buf_nram, T* compute_buf_nram, T* nram_ones, const int32_t deal_n, - const int32_t num_levels, const int32_t num_points, const int32_t num_keys, - const int32_t channels, const bool w_contain_inf, - const bool value_contain_infnan) { - int32_t channel_size = channels * sizeof(T); - int32_t sample_stride_3 = deal_n * num_levels * num_points; - int32_t value_stride_3 = num_levels * num_points * channels; - int32_t value_stride_3_size = value_stride_3 * sizeof(T); - T* buf_value_nram = buf_nram; // (4, num_levels, num_points, channels) - T* buf_value_nram_trans = - buf_nram + 4 * value_stride_3; // (4, num_levels, num_points, channels) - T* 
buf_value_nram_pool = - buf_nram + 8 * value_stride_3; // (1, num_levels, num_points, channels) - int32_t* sample_valid_count = - (int32_t*)(buf_nram + 9 * value_stride_3); // (deal_n) - T* weight_compute_nram = compute_buf_nram; // (4, num_levels, num_points) - int32_t* cond_compute_nram = - (int32_t*)(weight_compute_nram + 4 * num_levels * num_points); - - countValidSamples(sample_valid_count, cond_point_valid_nram, nram_ones, - (T*)wram_buffer, num_levels, num_points, deal_n); - __sync_compute(); - - int32_t* offset = data_offset_nram; - int32_t* stride_2_1 = offset + sample_stride_3; - int32_t* stride_3_1 = stride_2_1 + sample_stride_3; - T* output_nram = value_output_nram; - int32_t step_offset = 0; - T* value_src = SRAM_STAY ? value_sram : value_gdram; - for (int32_t i = 0; i < deal_n; i++) { - int32_t valid_num = sample_valid_count[i]; - if (SRAM_STAY) { - loadDataValueXram2NramAsync( - buf_value_nram, offset, stride_2_1, stride_3_1, value_src, valid_num, - channel_size, value_stride_3_size); - __sync_move(); - } else { - loadDataValueXram2NramAsync( - buf_value_nram, offset, stride_2_1, stride_3_1, value_src, valid_num, - channel_size, value_stride_3_size); - __sync_io(); - } - reduceLevelByConv( - output_nram, buf_value_nram, buf_value_nram_trans, buf_value_nram_pool, - (int32_t*)cond_point_polation_nram + step_offset, - weight_polation_nram + step_offset, weight_attn_nram + step_offset, - weight_compute_nram, cond_compute_nram, (T*)wram_buffer, valid_num, - channels, sample_stride_3, w_contain_inf, value_contain_infnan); - step_offset += valid_num; - offset = data_offset_nram + step_offset; - stride_2_1 = offset + sample_stride_3; - stride_3_1 = stride_2_1 + sample_stride_3; - output_nram += channels; - } -} - -template -__mlu_func__ void prepareLoop( - T* ones_nram, int32_t* spatial_offset_nram, int32_t* spatial_hw_nram, - int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram, - T* spatial_h_bd_nram, T* spatial_w_bd_nram, T* value_sram, - 
const char* data_level_start_index_gdram, - const char* data_spatial_shapes_gdram, const int32_t num_keys, - const int32_t num_levels, const int32_t num_points, - const int32_t max_deal_n, const int32_t mask_size, const int32_t channels) { - int32_t pad_num_points_levels = - PAD_UP(num_levels * num_points, WRAM_ALIGN_SIZE / sizeof(T)); - __bang_write_value(ones_nram, pad_num_points_levels, (T)0); - __bang_write_value(ones_nram, num_levels * num_points, (T)1); - __bang_write_value(mask_x_nram, mask_size, (char)0x55); - __bang_write_value(mask_y_nram, mask_size, (char)0xAA); - __memcpy_async(spatial_offset_nram, data_level_start_index_gdram, - num_levels * sizeof(int32_t), GDRAM2NRAM); - __memcpy_async(spatial_hw_nram, data_spatial_shapes_gdram, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - __sync_io_move_compute(); - broadcastSpatialHW(spatial_offset_bd_nram, spatial_h_bd_nram, - spatial_w_bd_nram, spatial_hw_nram, spatial_offset_nram, - num_levels, num_points); -} - -template -__mlu_func__ void loadDataValueGdram2Sram(T* value_sram, T* data_value_gdram, - const int32_t batch_idx, - const int32_t head_idx, - const int32_t num_keys, - const int32_t num_heads, - const int32_t channels) { - int32_t loop_num = (num_keys + MAX_MEMCPY_SEGNUM - 1) / MAX_MEMCPY_SEGNUM; - int32_t num_heads_channels = num_heads * channels; - for (int32_t i = 0; i < loop_num; i++) { - int32_t load_num = - __mluop_min(MAX_MEMCPY_SEGNUM, num_keys - i * MAX_MEMCPY_SEGNUM); - size_t src_offset = ((size_t)batch_idx * num_keys + i * MAX_MEMCPY_SEGNUM) * - num_heads_channels + - head_idx * channels; - int32_t dst_offset = i * MAX_MEMCPY_SEGNUM * channels; - __memcpy(value_sram + dst_offset, (T*)data_value_gdram + src_offset, - channels * sizeof(T), GDRAM2SRAM, channels * sizeof(T), - num_heads_channels * sizeof(T), load_num - 1); - } -} - -/* - The shape of each tensor: - ones_nram: (num_levels, num_points) - buf_compute_nram: (8, num_levels, num_points) - spatial_offset_nram: (num_levels) - 
spatial_hw_nram: (num_levels, 2) - spatial_offset_bd_nram: (num_levels, num_points) - spatial_w_bd_nram: (num_levels, num_points) - spatial_h_bd_nram: (num_levels, num_points) - mask_x_nram: (deal_n, num_levels, num_points, 2) / 8 - mask_y_nram: (deal_n, num_levels, num_points, 2) / 8 - value_output_nram: (deal_n, channels) - data_offset_nram: (4, deal_n, num_levels, num_points) - weight_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_valid_nram: (deal_n, num_levels, num_points) - loc_nram: (deal_n, num_levels, num_points, 2) - weight_attn_nram: (deal_n, num_levels, num_points) - buf_nram: (6, deal_n, num_levels, num_points) - - Note: buf_nram is reused in polation computing. -*/ -template -__mlu_func__ void memPolicyCommon( - T*& buf_compute_nram, T*& ones_nram, T*& value_output_nram, - int32_t*& data_offset_nram, T*& weight_polation_nram, - T*& cond_point_polation_nram, T*& cond_point_valid_nram, T*& loc_nram, - T*& weight_attn_nram, T*& buf_nram, T*& buf_nram_end, int8_t*& mask_x_nram, - int8_t*& mask_y_nram, T*& spatial_offset_bd_nram, T*& spatial_w_bd_nram, - T*& spatial_h_bd_nram, int32_t*& spatial_offset_nram, - int32_t*& spatial_hw_nram, T*& value_sram, int32_t& max_deal_n, - int32_t& mask_size, const int32_t batch_size, const int32_t num_keys, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_queries, const int32_t num_points) { - int32_t num_points_levels = num_levels * num_points; - int32_t pad_num_points_levels = - PAD_UP(num_points_levels, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t pad_num_points_levels_8 = - PAD_UP(8 * num_points_levels, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t spatial_info_size = - PAD_UP(3 * num_levels * sizeof(int32_t), NFU_ALIGN_SIZE); - int32_t fix_space_size = - spatial_info_size + 2 * BIT_COLLECT_PAD * sizeof(T) + - (4 * pad_num_points_levels + pad_num_points_levels_8) * sizeof(T); - int32_t left_space_size 
= NRAM_AVALIABLE_SIZE - fix_space_size; - int32_t common_buffer_size_each = 6 * num_points_levels * sizeof(T); - int32_t inter_result_size_each = - 17 * num_points_levels * sizeof(T) + channels * sizeof(T); - - max_deal_n = - left_space_size / (common_buffer_size_each + inter_result_size_each); - int32_t compute_buffer_size = - (9 * num_points_levels * channels + max_deal_n) * sizeof(T); - int32_t common_buffer_size = max_deal_n * common_buffer_size_each; - // make sure buf_nram is large enough for compute - if (compute_buffer_size > common_buffer_size) { - int32_t tmp_deal_n = - (left_space_size - compute_buffer_size) / inter_result_size_each; - max_deal_n = __mluop_min(max_deal_n, tmp_deal_n); - } - - int32_t reduce_need_wram_size = - getReduceLevelByConvWramSize(num_levels, num_points, channels); - int32_t count_valid_max = - PAD_DOWN(WRAM_AVALIABLE_SIZE / sizeof(T) / pad_num_points_levels, LT_NUM); - int32_t wram_deal_n = - (int)(reduce_need_wram_size <= WRAM_AVALIABLE_SIZE) * count_valid_max; - max_deal_n = __mluop_min(max_deal_n, wram_deal_n); - - int32_t total_points = max_deal_n * num_points_levels; - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - mask_size = total_coord_pad / BIT_COLLECT_PAD; - ones_nram = (T*)nram_buffer; - buf_compute_nram = ones_nram + pad_num_points_levels; - spatial_offset_nram = (int32_t*)(buf_compute_nram + pad_num_points_levels_8); - spatial_hw_nram = spatial_offset_nram + num_levels; - spatial_offset_bd_nram = (T*)(spatial_hw_nram + num_levels * 2); - spatial_w_bd_nram = spatial_offset_bd_nram + num_points_levels; - spatial_h_bd_nram = spatial_w_bd_nram + num_points_levels; - mask_x_nram = (int8_t*)(spatial_h_bd_nram + num_points_levels); - mask_y_nram = mask_x_nram + mask_size; - value_output_nram = (T*)(mask_y_nram + mask_size); - data_offset_nram = (int32_t*)(value_output_nram + max_deal_n * channels); - weight_polation_nram = (T*)(data_offset_nram + 4 * total_points); - cond_point_polation_nram = 
weight_polation_nram + 4 * total_points; - cond_point_valid_nram = cond_point_polation_nram + 4 * total_points; - loc_nram = cond_point_valid_nram + total_points; - weight_attn_nram = loc_nram + total_coord_pad; - buf_nram = weight_attn_nram + total_points; - buf_nram_end = buf_nram + 6 * max_deal_n * num_points_levels; - value_sram = (T*)sram_buffer; -} - -template -__mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { - int32_t input_stride_4 = num_queries * num_heads * num_levels * num_points; - int32_t input_stride_3 = num_heads * num_levels * num_points; - int32_t input_stride_2 = num_levels * num_points; - int32_t output_stride_3 = num_queries * num_heads * channels; - int32_t output_stride_2 = num_heads * channels; - int32_t data_value_stride_3 = num_keys * num_heads * channels; - constexpr bool sram_stay = (POLICY == 0); - - T* value_output_nram = nullptr; // (deal_n, channels) - int32_t* data_offset_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_valid_nram = nullptr; // (deal_n, num_levels, num_points) - T* loc_nram = nullptr; // (deal_n, num_levels, num_points, 2) - T* weight_attn_nram = nullptr; // (deal_n, num_levels, num_points) - T* buf_nram = nullptr; // (6, deal_n, num_levels, num_points) - T* buf_nram_end = nullptr; - int8_t* mask_x_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - int8_t* mask_y_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - T* 
spatial_offset_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_w_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_h_bd_nram = nullptr; // (num_levels, num_points) - int32_t* spatial_offset_nram = nullptr; // (num_levels) - int32_t* spatial_hw_nram = nullptr; // (num_levels, 2) - T* buf_compute_nram = nullptr; // (8, num_levels, num_points) - T* ones_nram = nullptr; // (1, num_levels, num_points) - T* value_sram = nullptr; // (num_keys, channels) - int32_t max_deal_n = 0; - int32_t mask_size = 0; - memPolicyCommon(buf_compute_nram, ones_nram, value_output_nram, - data_offset_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, - weight_attn_nram, buf_nram, buf_nram_end, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_w_bd_nram, - spatial_h_bd_nram, spatial_offset_nram, spatial_hw_nram, - value_sram, max_deal_n, mask_size, batch_size, num_keys, - num_heads, channels, num_levels, num_queries, num_points); - if (max_deal_n <= 0) { - return; - } - - // split batch*head into taskDimY - int32_t batch_head = batch_size * num_heads; - int32_t cluster_avg_batch_head = (batch_head + taskDimY - 1) / taskDimY; - int32_t cluster_begin_batch_head = taskIdY * cluster_avg_batch_head; - int32_t cluster_act_batch_head = __mluop_min( - cluster_avg_batch_head, batch_head - cluster_begin_batch_head); - int32_t cluster_end_batch_head = - cluster_begin_batch_head + cluster_act_batch_head; - // split query into coreDim - int32_t core_avg_query = (num_queries + coreDim - 1) / coreDim; - int32_t core_begin_query = coreId * core_avg_query; - int32_t core_act_query = - __mluop_min(num_queries - core_begin_query, core_avg_query); - int32_t core_loop_num = (core_act_query + max_deal_n - 1) / max_deal_n; - int32_t core_step_query = - core_loop_num > 0 ? 
(core_act_query + core_loop_num - 1) / core_loop_num - : 0; - int32_t core_remain_query = - core_act_query - (core_loop_num - 1) * core_step_query; - int32_t first_deal_query = - (int)(core_loop_num > 0) * - (core_loop_num > 1 ? core_step_query : core_remain_query); - - prepareLoop(ones_nram, spatial_offset_nram, spatial_hw_nram, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_h_bd_nram, - spatial_w_bd_nram, value_sram, data_level_start_index_gdram, - data_spatial_shapes_gdram, num_keys, num_levels, num_points, - max_deal_n, mask_size, channels); - - for (int32_t bh_idx = cluster_begin_batch_head; - bh_idx < cluster_end_batch_head; bh_idx++) { - int32_t b = bh_idx / num_heads; - int32_t head_idx = bh_idx % num_heads; - bool w_contain_inf = false; - bool value_contain_infnan = true; - - size_t output_base_offset = - (size_t)b * output_stride_3 + head_idx * channels; - int32_t attn_weight_base_offset = - b * input_stride_4 + head_idx * input_stride_2; - - if (sram_stay && __is_mpu()) { - loadDataValueGdram2Sram(value_sram, (T*)data_value_gdram, b, head_idx, - num_keys, num_heads, channels); - } - __sync_cluster(); - - if (__is_ipu()) { - if (sram_stay) { - int32_t buf_size = - (int)((char*)buf_nram_end - (char*)value_output_nram); - isValueContainInfNan(value_sram, value_sram + num_keys * channels, - value_output_nram, value_contain_infnan, buf_size, - num_keys * channels); - } - // compute weight, offset and condition - int32_t attn_weight_offset = - attn_weight_base_offset + core_begin_query * input_stride_3; - int32_t loc_offset = attn_weight_offset * 2; - if (first_deal_query > 0) { - __memcpy(loc_nram, (T*)data_sampling_loc_gdram + loc_offset, - input_stride_2 * 2 * sizeof(T), GDRAM2NRAM, - input_stride_2 * 2 * sizeof(T), input_stride_3 * 2 * sizeof(T), - first_deal_query - 1); - __memcpy( - weight_attn_nram, (T*)data_attn_weight_gdram + attn_weight_offset, - input_stride_2 * sizeof(T), GDRAM2NRAM, input_stride_2 * sizeof(T), - input_stride_3 * 
sizeof(T), first_deal_query - 1); - getConditionCoordWeight( - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, loc_nram, weight_attn_nram, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_w_bd_nram, - spatial_h_bd_nram, buf_nram, w_contain_inf, value_contain_infnan, - first_deal_query, num_levels, num_points, num_heads, channels); - } - } - - for (int32_t i = 0; __is_ipu() && i < core_loop_num; i++) { - int32_t deal_n = - i < core_loop_num - 1 ? core_step_query : core_remain_query; - int32_t load_n = - i < core_loop_num - 2 ? core_step_query : core_remain_query; - // load value and polation - loadNeighborPolationAttn( - value_output_nram, value_sram, - (T*)data_value_gdram + b * data_value_stride_3 + head_idx * channels, - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, weight_attn_nram, buf_nram, buf_compute_nram, - ones_nram, deal_n, num_levels, num_points, num_keys, channels, - w_contain_inf, value_contain_infnan); - __sync_io_move_compute(); - // load next weight and loc - if (i < core_loop_num - 1) { - int32_t core_query_offset = (i + 1) * core_step_query; - int32_t attn_weight_offset = - attn_weight_base_offset + - (core_begin_query + core_query_offset) * input_stride_3; - int32_t loc_offset = attn_weight_offset * 2; - __memcpy_async(loc_nram, (T*)data_sampling_loc_gdram + loc_offset, - input_stride_2 * 2 * sizeof(T), GDRAM2NRAM, - input_stride_2 * 2 * sizeof(T), - input_stride_3 * 2 * sizeof(T), load_n - 1); - __memcpy_async( - weight_attn_nram, (T*)data_attn_weight_gdram + attn_weight_offset, - input_stride_2 * sizeof(T), GDRAM2NRAM, input_stride_2 * sizeof(T), - input_stride_3 * sizeof(T), load_n - 1); - __sync_io_move_compute(); - } - // store result - size_t output_offset = - ((size_t)core_begin_query + i * core_step_query) * output_stride_2; - __memcpy_async((T*)data_col_gdram + output_base_offset + output_offset, - value_output_nram, channels * sizeof(T), 
NRAM2GDRAM, - output_stride_2 * sizeof(T), channels * sizeof(T), - deal_n - 1); - - // compute cond/weight/offset - if (i < core_loop_num - 1) { - getConditionCoordWeight( - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, loc_nram, weight_attn_nram, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_w_bd_nram, - spatial_h_bd_nram, buf_nram, w_contain_inf, value_contain_infnan, - load_n, num_levels, num_points, num_heads, channels); - } - __sync_io_move_compute(); - } - __sync_cluster(); - } -} -#endif - -#if (__BANG_ARCH__ == 592) - -/* - The shape of each tensor on nram: - spatial_offset_nram: (num_levels) - spatial_hw_nram: (num_levels, 2) - spatial_offset_bd_nram: (num_levels, num_points) - spatial_w_bd_nram: (num_levels, num_points) - spatial_h_bd_nram: (num_levels, num_points) - mask_x_nram: (deal_n, num_levels, num_points, 2) / 8 - mask_y_nram: (deal_n, num_levels, num_points, 2) / 8 - data_offset_nram: (4, deal_n, num_levels, num_points) - weight_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_valid_nram: (deal_n, num_levels, num_points) - loc_nram: (deal_n, num_levels, num_points, 2) - buf_nram: (6, deal_n, num_levels, num_points) - - The shape of each tensor on sram: - data_offset_nram: (4, deal_n, num_levels, num_points) - weight_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_polation_nram: (4, deal_n, num_levels, num_points) / 8 - cond_point_valid_nram: (deal_n, num_levels, num_points) -*/ -template -__mlu_func__ void memPolicy590( - T*& zeros_nram, int32_t*& data_offset_nram, T*& weight_polation_nram, - T*& cond_point_polation_nram, T*& cond_point_valid_nram, T*& loc_nram, - T*& buf_nram, T*& buf_nram_end, int8_t*& mask_x_nram, int8_t*& mask_y_nram, - T*& spatial_offset_bd_nram, T*& spatial_w_bd_nram, T*& spatial_h_bd_nram, - int32_t*& spatial_offset_nram, int32_t*& spatial_hw_nram, T*& value_ping, - T*& 
value_pong, T*& compute_buffer, T*& weight_polation_nram_stg2, - T*& weight_attn_nram_stg2, int32_t*& offset_nram_stg2, T*& output_nram, - T*& cond_nram_stg2, int32_t*& data_offset_sram, T*& weight_polation_sram, - T*& weight_attn_sram, T*& cond_point_polation_sram, char* nram_buffer, - char* sram_buffer, int32_t& max_cached_n, int32_t& stage_1_max_deal_n, - int32_t& stage_2_max_deal_n, int32_t& mask_size, - const int32_t nram_avaliable_size, const int32_t sram_avaliable_size, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points) { - int32_t num_points_levels = num_levels * num_points; - int32_t spatial_info_size = - PAD_UP(3 * num_levels * sizeof(int32_t), WRAM_ALIGN_SIZE); - int32_t spatial_info_bd_size = - PAD_UP(3 * num_points_levels * sizeof(T), WRAM_ALIGN_SIZE); - int32_t zeros_size = PAD_UP(channels * sizeof(T), WRAM_ALIGN_SIZE); - int32_t fix_space_size = spatial_info_size + 2 * BIT_COLLECT_PAD * sizeof(T) + - spatial_info_bd_size + zeros_size; - int32_t left_space_size = nram_avaliable_size - fix_space_size; - stage_1_max_deal_n = left_space_size / (20 * num_points_levels * sizeof(T)); - int32_t total_points = stage_1_max_deal_n * num_points_levels; - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - mask_size = PAD_UP(total_coord_pad / BIT_COLLECT_PAD, WRAM_ALIGN_SIZE); - stage_2_max_deal_n = - (left_space_size - 2 * mask_size) / - ((12 * num_points_levels * channels + 17 * num_points_levels) * - sizeof(T)); - // fix nram space - zeros_nram = (T*)(nram_buffer); - spatial_offset_nram = (int32_t*)(zeros_nram + zeros_size / sizeof(T)); - spatial_hw_nram = spatial_offset_nram + num_levels; - spatial_offset_bd_nram = - (T*)((int8_t*)spatial_offset_nram + spatial_info_size); - spatial_w_bd_nram = spatial_offset_bd_nram + num_points_levels; - spatial_h_bd_nram = spatial_w_bd_nram + num_points_levels; - mask_x_nram 
= (int8_t*)spatial_offset_bd_nram + spatial_info_bd_size; - mask_y_nram = mask_x_nram + mask_size; - // stage1 nram space - // 4 + 4 + 4 + 1 + 6 - data_offset_nram = (int32_t*)(mask_y_nram + mask_size); - weight_polation_nram = (T*)(data_offset_nram + 4 * total_points); - cond_point_polation_nram = weight_polation_nram + 4 * total_points; - cond_point_valid_nram = cond_point_polation_nram + 4 * total_points; - buf_nram = cond_point_valid_nram + total_points; - loc_nram = buf_nram + 4 * total_points; - buf_nram_end = buf_nram + 6 * total_points + total_coord_pad; - // stage2 nram space - int32_t total_points_stg2 = stage_2_max_deal_n * num_points_levels; - cond_nram_stg2 = (T*)(mask_y_nram + mask_size); - value_ping = cond_nram_stg2 + 4 * total_points_stg2 + BIT_COLLECT_PAD; - value_pong = value_ping + 4 * total_points_stg2 * channels; - compute_buffer = value_pong + 4 * total_points_stg2 * channels; - weight_polation_nram_stg2 = compute_buffer + 4 * total_points_stg2 * channels; - weight_attn_nram_stg2 = weight_polation_nram_stg2 + 4 * total_points_stg2; - offset_nram_stg2 = (int32_t*)(weight_attn_nram_stg2 + total_points_stg2); - // sram space: 4 + 4 + 1 + 4 - int32_t polation_info_size = 13 * num_points_levels * sizeof(T); - int32_t avg_sram_size = sram_avaliable_size / coreDim; - max_cached_n = avg_sram_size / polation_info_size; - int max_cached_points = max_cached_n * num_points_levels; - T* sram_buf_base = (T*)(sram_buffer + avg_sram_size * coreId); - data_offset_sram = (int32_t*)sram_buf_base; - weight_polation_sram = (T*)(data_offset_sram + 4 * max_cached_points); - weight_attn_sram = (T*)(weight_polation_sram + 4 * max_cached_points); - cond_point_polation_sram = (T*)(weight_attn_sram + max_cached_points); -} - -template -__mlu_func__ void forwardStageTwoLoop( - T* value_ping_nram, T* value_pong_nram, T* compute_buffer_nram, - T* zeros_nram, T* weight_polation_nram_stg2, T* weight_attn_nram_stg2, - int32_t* offset_nram_stg2, T* output_nram, T* 
cond_nram_stg2, - int32_t* data_offset_sram, T* weight_polation_sram, T* weight_attn_sram, - T* cond_point_polation_sram, T* data_value_gdram, T* weight_attn_gdram, - T* output_gdram, const int32_t total_deal_n, const int32_t max_deal_n, - const int32_t input_stride_2, const int32_t input_stride_3, - const int32_t output_stride_2, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, - const int32_t num_points) { - int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - int32_t num_levels_points = num_levels * num_points; - int32_t sram_src_stride = total_deal_n * num_levels_points * sizeof(T); - T* value_nram[2] = {value_ping_nram, value_pong_nram}; - int32_t* offset_zero_nram_stg2 = - offset_nram_stg2 + 4 * max_deal_n * num_levels_points; - for (int32_t i = 0; i < loop_num + 1; i++) { - int32_t compute_idx = i - 1; - int32_t compute_offset = compute_idx * max_deal_n; - int32_t load_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t compute_n = - std::min(total_deal_n - compute_idx * max_deal_n, max_deal_n); - int32_t load_point_num = 4 * load_n * num_levels_points; - int32_t nq_nlp_4 = compute_n * num_levels_points * 4; - int32_t nq_nlp = compute_n * num_levels_points; - - int32_t total_point_pad_8 = PAD_UP(load_point_num, BIT_COLLECT_PAD); - int32_t gather_mask_size = total_point_pad_8 / BIT_COLLECT_PAD; - T* v_compute = value_nram[compute_idx % 2]; - T* v_load = value_nram[i % 2]; - int8_t* cond_nram_stg2_reverse = (int8_t*)cond_nram_stg2 + gather_mask_size; - - if (i > 0) { - int32_t copy_size_1 = compute_n * num_levels_points * sizeof(T); - int32_t sram_src_offset = compute_idx * max_deal_n * num_levels_points; - __memcpy_async(weight_polation_nram_stg2, - weight_polation_sram + sram_src_offset, copy_size_1, - SRAM2NRAM, copy_size_1, sram_src_stride, 3); - __memcpy_async(weight_attn_nram_stg2, weight_attn_sram + sram_src_offset, - copy_size_1, SRAM2NRAM); - } - - if (i < loop_num) { - int32_t copy_size_1 = 
load_n * num_levels_points * sizeof(T); - int32_t copy_size_2 = load_n * num_levels_points * sizeof(int32_t); - int32_t sram_src_offset = i * max_deal_n * num_levels_points; - __memcpy_async(offset_nram_stg2, data_offset_sram + sram_src_offset, - copy_size_2, SRAM2NRAM, copy_size_2, sram_src_stride, 3); - __memcpy_async(cond_nram_stg2, cond_point_polation_sram + sram_src_offset, - copy_size_1, SRAM2NRAM, copy_size_1, sram_src_stride, 3); - __bang_write_value(compute_buffer_nram, load_point_num, (T)0); - __bang_write_value(offset_zero_nram_stg2, load_point_num, (int32_t)0); - __sync_move(); - __bang_gt_bitindex(cond_nram_stg2, cond_nram_stg2, compute_buffer_nram, - total_point_pad_8); - __bang_bnot((char*)cond_nram_stg2_reverse, (char*)cond_nram_stg2, - gather_mask_size); - } - - __sync_io_move_compute(); - - if (i < loop_num) { - gatherAsync(v_load, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - cond_nram_stg2_reverse, channels * sizeof(T), NRAM2NRAM, - channels * sizeof(T), load_point_num); - gatherAsync(v_load, data_value_gdram, (unsigned int*)offset_nram_stg2, - cond_nram_stg2, channels * sizeof(T), GDRAM2NRAM, - channels * sizeof(T), load_point_num); - } - - if (i > 0) { - __bang_transpose(compute_buffer_nram, v_compute, nq_nlp_4, channels); - __bang_cycle_mul(compute_buffer_nram, compute_buffer_nram, - weight_polation_nram_stg2, channels * nq_nlp_4, - nq_nlp_4); - __bang_sumpool(v_compute, compute_buffer_nram, nq_nlp, channels, 4, 1, 4, - 1, 1); - __bang_cycle_mul(v_compute, v_compute, weight_attn_nram_stg2, - channels * nq_nlp, nq_nlp); - __bang_transpose(compute_buffer_nram, v_compute, channels, nq_nlp); - __bang_sumpool(v_compute, compute_buffer_nram, channels, compute_n, - num_levels_points, 1, num_levels_points, 1, 1); - __memcpy(output_gdram + compute_offset * output_stride_2, v_compute, - channels * sizeof(T), NRAM2GDRAM, output_stride_2 * sizeof(T), - channels * sizeof(T), compute_n - 1); - } - __sync_io_move_compute(); - } -} - -// only for 590 
-template -__mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { - int32_t input_stride_4 = num_queries * num_heads * num_levels * num_points; - int32_t input_stride_3 = num_heads * num_levels * num_points; - int32_t input_stride_2 = num_levels * num_points; - int32_t output_stride_3 = num_queries * num_heads * channels; - int32_t output_stride_2 = num_heads * channels; - int32_t data_value_stride_3 = num_keys * num_heads * channels; - - T* zeros_nram = nullptr; // (channels) - int32_t* data_offset_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_valid_nram = nullptr; // (deal_n, num_levels, num_points) - T* loc_nram = nullptr; // (deal_n, num_levels, num_points, 2) - T* buf_nram = nullptr; // (6, deal_n, num_levels, num_points) - T* buf_nram_end = nullptr; - int8_t* mask_x_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - int8_t* mask_y_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - T* spatial_offset_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_w_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_h_bd_nram = nullptr; // (num_levels, num_points) - int32_t* spatial_offset_nram = nullptr; // (num_levels) - int32_t* spatial_hw_nram = nullptr; // (num_levels, 2) - T* value_ping_nram = nullptr; // (deal_n, num_levels, num_points, channels) - T* value_pong_nram = nullptr; // (deal_n, num_levels, num_points, channels) - T* compute_buffer_nram = - nullptr; 
// (deal_n, num_levels, num_points, channels) - T* weight_polation_nram_stg2 = - nullptr; // (4, deal_n, num_levels, num_points) - T* weight_attn_nram_stg2 = nullptr; // (1, deal_n, num_levels, num_points) - int32_t* offset_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* output_nram = nullptr; // (deal_n, channels) - T* cond_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* value_sram = nullptr; // (num_keys, channels) - int32_t* data_offset_sram = nullptr; - T* weight_polation_sram = nullptr; - T* wegith_attn_sram = nullptr; - T* cond_point_polation_sram = nullptr; - int32_t stage_1_max_deal_n = 0; - int32_t stage_2_max_deal_n = 0; - int32_t max_cached_n = 0; - int32_t mask_size = 0; - memPolicy590( - zeros_nram, data_offset_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, buf_nram, - buf_nram_end, mask_x_nram, mask_y_nram, spatial_offset_bd_nram, - spatial_w_bd_nram, spatial_h_bd_nram, spatial_offset_nram, - spatial_hw_nram, value_ping_nram, value_pong_nram, compute_buffer_nram, - weight_polation_nram_stg2, weight_attn_nram_stg2, offset_nram_stg2, - output_nram, cond_nram_stg2, data_offset_sram, weight_polation_sram, - wegith_attn_sram, cond_point_polation_sram, nram_buffer, sram_buffer, - max_cached_n, stage_1_max_deal_n, stage_2_max_deal_n, mask_size, - NRAM_AVALIABLE_SIZE, SRAM_AVALIABLE_SIZE, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points); - if (stage_1_max_deal_n <= 0 || stage_2_max_deal_n <= 0) { - return; - } - - int32_t cluster_begin_batch_head = 0; - int32_t cluster_act_batch_head = 0; - int32_t cluster_end_batch_head = 0; - int32_t core_begin_query = 0; - int32_t core_act_query = 0; - int32_t core_loop_num = 0; - int32_t core_step_query = 0; - splitTaskV2(cluster_begin_batch_head, cluster_act_batch_head, - cluster_end_batch_head, core_begin_query, core_act_query, - core_loop_num, core_step_query, max_cached_n, batch_size, - num_keys, num_heads, 
channels, num_levels, num_queries, - num_points); - - prepareLoopV2((int32_t*)nullptr, zeros_nram, spatial_offset_nram, - spatial_hw_nram, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_h_bd_nram, spatial_w_bd_nram, - value_sram, data_level_start_index_gdram, - data_spatial_shapes_gdram, num_keys, num_levels, num_points, - stage_1_max_deal_n, mask_size, channels); - - for (int32_t bh_idx = cluster_begin_batch_head; - bh_idx < cluster_end_batch_head; bh_idx++) { - int32_t b = bh_idx / num_heads; - int32_t head_idx = bh_idx % num_heads; - size_t output_base_offset = - (size_t)b * output_stride_3 + head_idx * channels; - size_t attn_weight_base_offset = - (size_t)b * input_stride_4 + head_idx * input_stride_2; - size_t data_value_base_offset = - (size_t)b * data_value_stride_3 + head_idx * channels; - - for (int32_t i = 0; __is_ipu() && i < core_loop_num; i++) { - int32_t deal_n = - std::min(core_act_query - core_step_query * i, core_step_query); - int32_t core_query_offset = i * core_step_query; - size_t attn_weight_offset = - attn_weight_base_offset + - (core_begin_query + core_query_offset) * input_stride_3; - size_t loc_offset = attn_weight_offset * 2; - size_t output_offset = - output_base_offset + - (core_begin_query + i * core_step_query) * output_stride_2; - - // compute offset/cond/wp - stageOneLoop((T*)data_sampling_loc_gdram + loc_offset, - (T*)data_attn_weight_gdram + attn_weight_offset, - data_offset_nram, nullptr, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, - buf_nram, buf_nram_end, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_w_bd_nram, spatial_h_bd_nram, - spatial_offset_nram, spatial_hw_nram, data_offset_sram, - nullptr, weight_polation_sram, wegith_attn_sram, - cond_point_polation_sram, false, false, deal_n, - stage_1_max_deal_n, num_heads, channels, num_levels, - num_points, input_stride_2, input_stride_3); - - // compute and store output - forwardStageTwoLoop( - value_ping_nram, 
value_pong_nram, compute_buffer_nram, zeros_nram, - weight_polation_nram_stg2, weight_attn_nram_stg2, offset_nram_stg2, - output_nram, cond_nram_stg2, data_offset_sram, weight_polation_sram, - wegith_attn_sram, cond_point_polation_sram, - (T*)data_value_gdram + data_value_base_offset, - (T*)data_attn_weight_gdram + attn_weight_offset, - (T*)data_col_gdram + output_offset, deal_n, stage_2_max_deal_n, - input_stride_2, input_stride_3, output_stride_2, num_heads, channels, - num_levels, num_points); - } - } -} - -#endif - -template -__mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { -#if (__BANG_ARCH__ == 372) - size_t single_value_size = num_keys * channels * sizeof(T); - if (single_value_size <= SRAM_FOR_VALUE_SIZE) { - MLUKernelMsDeformAttnForwardFastImpl( - data_value_gdram, data_spatial_shapes_gdram, - data_level_start_index_gdram, data_sampling_loc_gdram, - data_attn_weight_gdram, batch_size, num_keys, num_heads, channels, - num_levels, num_queries, num_points, data_col_gdram); - } else { - MLUKernelMsDeformAttnForwardFastImpl( - data_value_gdram, data_spatial_shapes_gdram, - data_level_start_index_gdram, data_sampling_loc_gdram, - data_attn_weight_gdram, batch_size, num_keys, num_heads, channels, - num_levels, num_queries, num_points, data_col_gdram); - } -#endif - -#if (__BANG_ARCH__ == 592) - MLUKernelMsDeformAttnForwardFastImpl( - data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram, - data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys, - num_heads, channels, num_levels, num_queries, num_points, data_col_gdram); -#endif -} - 
-template __mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram); diff --git a/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu b/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu deleted file mode 100644 index 398fe9679..000000000 --- a/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu +++ /dev/null @@ -1,557 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include - -#include "kernels/ms_deform_attn_forward/ms_deform_attn_forward.h" - -#define ELE_COUNT 32 /* cycle element count */ - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void genMask0101(float *mask_ram, int32_t size) { -#if __BANG_ARCH__ >= 372 - int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); - for (int32_t i = 0; i < align_num; ++i) { - mask_ram[i] = i % 2; - } - __sync(); - // NOTE: when channel is 1, mask_ram may be overwritten, since we - // align size to CEIL_ALIGN(size, align_num) - __memcpy(mask_ram + align_num, mask_ram, NFU_ALIGN_SIZE, NRAM2NRAM, - NFU_ALIGN_SIZE, 0, (size / align_num + (size % align_num > 0)) - 2); - __sync(); -#endif -} - -template -__mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - size_t block_num_per_core = 0, batch_start = 0, deal_g = 0, offset_g = 0; - size_t block_num_rem = 0; - const size_t grid_total = num_queries * num_heads * num_levels * num_points; - if (batch_size >= taskDim) { - block_num_rem = batch_size % taskDim; - block_num_per_core = taskId < block_num_rem ? batch_size / taskDim + 1 - : batch_size / taskDim; - batch_start = taskId < block_num_rem - ? taskId * block_num_per_core - : taskId * block_num_per_core + block_num_rem; - deal_g = grid_total; - offset_g = 0; - } else { - size_t skip_n = taskDim / batch_size; - batch_start = taskId / skip_n; - block_num_per_core = batch_start >= batch_size ? 
0 : 1; - deal_g = PAD_UP(grid_total / skip_n, num_levels * num_points); - size_t id = taskId % skip_n; - offset_g = id * deal_g; - deal_g = offset_g > grid_total ? 0 - : ((id + 1) *deal_g > grid_total - ? deal_g = grid_total - offset_g - : deal_g); - } - if (deal_g == 0) { - return; - } - const int32_t float_align = NFU_ALIGN_SIZE / sizeof(float); - int32_t deal_num = 1; - int32_t cut_channel_iter = 2; - const size_t spatial_size = - PAD_UP(num_levels * 2 * sizeof(int32_t), NFU_ALIGN_SIZE); - const size_t level_start_index_size = - PAD_UP(num_levels * sizeof(int32_t), NFU_ALIGN_SIZE); - int32_t channel = channels; - int32_t mult; - while (true) { - deal_num = (MAX_NRAM_SIZE - spatial_size - level_start_index_size) / - (8 * channel + 8) / sizeof(T); - deal_num = PAD_DOWN(deal_num, float_align); - deal_num = PAD_DOWN(deal_num, num_levels * num_points); - if (deal_num > 0) { - break; - } else { - channel = channels / cut_channel_iter; - cut_channel_iter += 2; - } - } - mult = channel; - const int32_t c_rep = channels / channel; - const int32_t c_rem = channels % channel; - const int32_t g_rep = deal_g / deal_num; - const int32_t g_rem = deal_g % deal_num; - // nram buffer alloc - char *data_spatial_shapes_nram = nram_buffer; - char *data_level_start_index_nram = data_spatial_shapes_nram + spatial_size; - char *input_tl = data_level_start_index_nram + level_start_index_size; - char *input_tr = input_tl + deal_num * mult * sizeof(T); - char *input_bl = input_tr + deal_num * mult * sizeof(T); - char *input_br = input_bl + deal_num * mult * sizeof(T); - char *weight_tl = input_tl + 4 * deal_num * mult * sizeof(T); - char *weight_tr = weight_tl + deal_num * mult * sizeof(T); - char *weight_bl = weight_tr + deal_num * mult * sizeof(T); - char *weight_br = weight_bl + deal_num * mult * sizeof(T); - char *mask_tl = weight_br + deal_num * mult * sizeof(T); - char *mask_tr = mask_tl + deal_num * sizeof(T); - char *mask_bl = mask_tr + deal_num * sizeof(T); - char *mask_br = 
mask_bl + deal_num * sizeof(T); - char *point_ram = mask_br + deal_num * sizeof(T); - char *index_tl = point_ram + deal_num * sizeof(T); - char *index_bl = index_tl + deal_num * sizeof(T); - char *valid_mask = index_bl + deal_num * sizeof(T); - // nram space reuse - char *grid_ram = weight_tl; - char *mask_ram = weight_bl; - char *coord_x = input_bl; - char *coord_y = coord_x + deal_num * sizeof(T); - char *coord_x_low = input_tl; - char *coord_y_low = coord_x_low + deal_num * sizeof(T); - char *coord_x_low_int = weight_tl; - char *coord_y_low_int = weight_tr; - char *spatial_x = mask_tl; - char *spatial_y = mask_tr; - char *spatial_x_float = weight_bl; - char *spatial_y_float = weight_br; - char *spatial_x_temp = mask_bl; - char *spatial_y_temp = mask_br; -#if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - char *base_ptr_offset = weight_tl; -#endif - char *auxiliary_a = point_ram; - char *auxiliary_b = weight_bl; - __memcpy_async(data_spatial_shapes_nram, data_spatial_shapes_gdram, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - __memcpy_async(data_level_start_index_nram, data_level_start_index_gdram, - num_levels * sizeof(int32_t), GDRAM2NRAM); - __sync(); - for (int32_t batch_idx = batch_start; - batch_idx < batch_start + block_num_per_core; ++batch_idx) { - for (int32_t grid_iter = 0; grid_iter <= g_rep; ++grid_iter) { - int32_t io_data_num = deal_num; - const int32_t grid_off_base = - batch_idx * grid_total + offset_g + grid_iter * deal_num; - if (grid_iter == g_rep) { - if (g_rem == 0) { - continue; - } else { - io_data_num = g_rem; - } - } - char *data_col_gdram_start = - data_col_gdram + (batch_idx * num_queries * num_heads * channels + - (offset_g + grid_iter * deal_num) / - (num_levels * num_points) * channels) * - sizeof(float); - // load data_sampling_loc - __memcpy_async( - grid_ram, data_sampling_loc_gdram + grid_off_base * 2 * sizeof(float), - io_data_num * 2 * sizeof(float), GDRAM2NRAM); - genMask0101((float *)mask_ram, deal_num * 2); - __sync(); - // 
generate x and y coordinate vector - // generate spatial_x and spatial_y spatial vector - __bang_collect((float *)coord_y, (float *)grid_ram, (float *)mask_ram, - deal_num * 2); // y - __bang_collect((float *)spatial_x_temp, (float *)data_spatial_shapes_nram, - (float *)mask_ram, - num_levels * 2); // spatial_x - __bang_not((float *)mask_ram, (float *)mask_ram, deal_num * 2); - __bang_collect((float *)coord_x, (float *)grid_ram, (float *)mask_ram, - deal_num * 2); // x - __bang_collect((float *)spatial_y_temp, (float *)data_spatial_shapes_nram, - (float *)mask_ram, - num_levels * 2); // spatial_y - for (int32_t i = 0; i < num_levels; i++) { - __bang_write_value((int32_t *)spatial_x + i * num_points, num_points, - ((int32_t *)spatial_x_temp)[i]); - __bang_write_value((int32_t *)spatial_y + i * num_points, num_points, - ((int32_t *)spatial_y_temp)[i]); - } - __bang_int322float_rd((float *)spatial_x_float, (int32_t *)spatial_x, - num_levels * num_points, 0); - __bang_int322float_rd((float *)spatial_y_float, (int32_t *)spatial_y, - num_levels * num_points, 0); - /* - map x from [0, 1] to [0, spatial_x]; - map y from [0, 1] to [0, spatial_y] - */ - __bang_cycle_mul((float *)coord_x, (float *)coord_x, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_sub_scalar((float *)coord_x, (float *)coord_x, (float)0.5, - deal_num); - __bang_cycle_mul((float *)coord_y, (float *)coord_y, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - __bang_sub_scalar((float *)coord_y, (float *)coord_y, (float)0.5, - deal_num); - // generate valid mask, which means the location is nan/inf or not - // condition coordx > -1 / coordy > -1 - __bang_gt_scalar((float *)auxiliary_a, (float *)coord_x, -1.0, deal_num); - __bang_move((char *)valid_mask, (char *)auxiliary_a, - deal_num * sizeof(float)); - __bang_gt_scalar((float *)auxiliary_a, (float *)coord_y, -1.0, deal_num); - __bang_add((float *)valid_mask, (float *)valid_mask, (float *)auxiliary_a, - deal_num); 
- - // condition coordx < spatial_x / coordy < spatial_y - __bang_cycle_le((float *)mask_bl, (float *)coord_x, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_cycle_le((float *)mask_br, (float *)coord_y, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - - __bang_add((float *)mask_bl, (float *)mask_bl, (float *)mask_br, - deal_num); - __bang_add((float *)valid_mask, (float *)valid_mask, (float *)mask_bl, - deal_num); - // all condition satisfied, value should be 4. - __bang_eq_scalar((float *)valid_mask, (float *)valid_mask, 4, deal_num); - - // get floor value of coord - __bang_floor((float *)coord_x_low, (float *)coord_x, deal_num); - __bang_floor((float *)coord_y_low, (float *)coord_y, deal_num); - // calc index_tl - const int32_t w_stride = num_heads * channels; - __bang_float2int32_rd((int32_t *)coord_x_low_int, (float *)coord_x_low, - deal_num, 0); - __bang_float2int32_rd((int32_t *)coord_y_low_int, (float *)coord_y_low, - deal_num, 0); - __bang_cycle_mul((int32_t *)index_tl, (int32_t *)coord_y_low_int, - (int32_t *)spatial_x, deal_num, num_levels * num_points); - __bang_add((int32_t *)index_tl, (int32_t *)index_tl, - (int32_t *)coord_x_low_int, deal_num); - __bang_mul_scalar((int32_t *)index_tl, (int32_t *)index_tl, w_stride, - deal_num); -#if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - const int32_t deal_lp_num = deal_num / (num_levels * num_points); - const int32_t h_rep = deal_lp_num / num_heads; - const int32_t h_rem = deal_lp_num % num_heads; - const int32_t head_start = - ((offset_g + grid_iter * deal_num) / (num_levels * num_points)) % - num_heads; - for (int32_t iter = 0; iter < num_heads; ++iter) { - ((int32_t *)base_ptr_offset)[iter] = - ((head_start + iter) % num_heads) * channels; - } - if (h_rep > 0) { - __memcpy((int32_t *)base_ptr_offset + num_heads, - (int32_t *)base_ptr_offset, num_heads * sizeof(int32_t), - NRAM2NRAM, num_heads * sizeof(int32_t), 0, h_rep - 1); - } - if (h_rep > 0 && h_rem > 0) { - 
__memcpy((int32_t *)base_ptr_offset + h_rep * num_heads, - (int32_t *)base_ptr_offset, h_rem * sizeof(int32_t), - NRAM2NRAM); - } - __bang_transpose((int32_t *)auxiliary_a, (int32_t *)index_tl, deal_lp_num, - num_levels * num_points); - __bang_cycle_add((int32_t *)auxiliary_a, (int32_t *)auxiliary_a, - (int32_t *)base_ptr_offset, deal_num, deal_lp_num); - __bang_transpose((int32_t *)index_tl, (int32_t *)auxiliary_a, - num_levels * num_points, deal_lp_num); -#endif - // calc index_bl - __bang_mul_scalar((int32_t *)auxiliary_a, (int32_t *)spatial_x, w_stride, - deal_num); - __bang_cycle_add((int32_t *)index_bl, (int32_t *)index_tl, - (int32_t *)auxiliary_a, deal_num, - num_levels * num_points); - // calc mask_tl, mask_tr, mask_bl, mask_br - __bang_sub_scalar((float *)spatial_x_float, (float *)spatial_x_float, - (float)1.0, deal_num); - __bang_sub_scalar((float *)spatial_y_float, (float *)spatial_y_float, - (float)1.0, deal_num); - // mask_tl : - // 0 <= coord_x_low < spatial_x && 0 <= coord_y_low < spatial_y - __bang_ge_scalar((float *)mask_bl, (float *)coord_x_low, (float)0, - deal_num); - __bang_cycle_le((float *)mask_br, (float *)coord_x_low, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_and((float *)mask_bl, (float *)mask_bl, (float *)mask_br, - deal_num); - __bang_ge_scalar((float *)mask_tr, (float *)coord_y_low, (float)0, - deal_num); - __bang_cycle_le((float *)mask_br, (float *)coord_y_low, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - __bang_and((float *)mask_tr, (float *)mask_tr, (float *)mask_br, - deal_num); - __bang_and((float *)mask_tl, (float *)mask_tr, (float *)mask_bl, - deal_num); - // mask_tr : - // 0 <= coord_x_high < spatial_x && 0 <= coord_y_low < spatial_y - __bang_ge_scalar((float *)mask_br, (float *)coord_x_low, (float)(-1.0), - deal_num); - __bang_cycle_lt((float *)auxiliary_a, (float *)coord_x_low, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_and((float 
*)mask_br, (float *)mask_br, (float *)auxiliary_a, - deal_num); - __bang_and((float *)mask_tr, (float *)mask_tr, (float *)mask_br, - deal_num); - // mask_bl : - // 0 <= coord_x_low < spatial_x && 0 <= coord_y_high < spatial_y - __bang_ge_scalar((float *)auxiliary_a, (float *)coord_y_low, - (float)(-1.0), deal_num); - __bang_cycle_lt((float *)auxiliary_b, (float *)coord_y_low, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - __bang_and((float *)auxiliary_a, (float *)auxiliary_a, - (float *)auxiliary_b, deal_num); - __bang_and((float *)mask_bl, (float *)mask_bl, (float *)auxiliary_a, - deal_num); - // mask_br : - // 0 <= coord_x_high < spatial_x && 0 <= coord_y_high < spatial_y - __bang_and((float *)mask_br, (float *)mask_br, (float *)auxiliary_a, - deal_num); - // if loc has nan/inf, fill invalid value with 0. - // Note, althrough nan joins the compatution, the comparison returns - // normal value. - __bang_cycle_and((float *)mask_tl, (float *)mask_tl, (float *)valid_mask, - 4 * deal_num, deal_num); - - // switch valid_mask to bit-type mask. 1 to 0xffffffff, 0 to 0x00000000 - // first we cast float32 to int32. 
then multiply -1, - // whose hex is 0xffffffff - __bang_float2int32_rd((int32_t *)valid_mask, (float *)valid_mask, - deal_num, 0); - __bang_mul_scalar((int32_t *)valid_mask, (int32_t *)valid_mask, -1, - deal_num); - - // calc inner point num - __bang_mul_scalar((float *)weight_tl, (float *)mask_tl, (float)7.0, - deal_num); - __bang_mul_scalar((float *)weight_tr, (float *)mask_tr, (float)5.0, - deal_num); - __bang_add((float *)weight_tl, (float *)weight_tl, (float *)weight_tr, - deal_num); - __bang_mul_scalar((float *)weight_tr, (float *)mask_bl, (float)3.0, - deal_num); - __bang_add((float *)point_ram, (float *)weight_tr, (float *)mask_br, - deal_num); - __bang_add((float *)point_ram, (float *)point_ram, (float *)weight_tl, - deal_num); - // calc interpolation weight - __bang_sub((float *)weight_bl, (float *)coord_x_low, (float *)coord_x, - deal_num); - __bang_sub((float *)weight_br, (float *)coord_y_low, (float *)coord_y, - deal_num); - __bang_add_scalar((float *)weight_bl, (float *)weight_bl, (float)1.0, - deal_num); - __bang_add_scalar((float *)weight_br, (float *)weight_br, (float)1.0, - deal_num); - __bang_sub((float *)weight_tl, (float *)coord_x, (float *)coord_x_low, - deal_num); - __bang_sub((float *)weight_tr, (float *)coord_y, (float *)coord_y_low, - deal_num); - __bang_mul((float *)input_tl, (float *)weight_bl, (float *)weight_br, - deal_num); - __bang_mul((float *)input_tl + deal_num, (float *)weight_br, - (float *)weight_tl, deal_num); - __bang_mul((float *)input_tl + 2 * deal_num, (float *)weight_bl, - (float *)weight_tr, deal_num); - __bang_mul((float *)input_tl + 3 * deal_num, (float *)weight_tl, - (float *)weight_tr, deal_num); - // if loc has nan/inf, fill all invalid potision with 0. - // Note that this operation handles in bit-scale. 
- __bang_cycle_band((char *)input_tl, (char *)input_tl, (char *)valid_mask, - 4 * deal_num * sizeof(float), deal_num * sizeof(float)); - __sync(); - // extend weight - const int32_t w_rep = channel / ELE_COUNT * ELE_COUNT; - const int32_t w_rem = channel % ELE_COUNT; - if (w_rem != 0) { - const int32_t data_sz = 1 * sizeof(float); - const int32_t dst_str = channel * sizeof(float); - for (int32_t iter = w_rep; iter < channel; ++iter) { - __memcpy_async((float *)weight_tl + iter, (float *)input_tl, data_sz, - NRAM2NRAM, dst_str, data_sz, 4 * deal_num - 1); - } - } - if (w_rep != 0) { - for (int32_t i = 0; i < 4 * deal_num; i++) { - __bang_write_value((float *)weight_tl + i * channel, w_rep, - ((float *)input_tl)[i]); - } - } - __sync(); - const char *data_value_gdram_start = - data_value_gdram + - batch_idx * num_keys * num_heads * channels * sizeof(float); - const int32_t c_str = deal_num * channel * sizeof(float); - const int32_t cs_str = num_heads * channels * sizeof(float); - for (int32_t c_iter = 0; c_iter <= c_rep; ++c_iter) { - int32_t c_real_num = channel; - if (c_iter == c_rep) { - if (c_rem == 0) { - continue; - } else { - c_real_num = c_rem; - } - } - __bang_write_zero((float *)input_tl, 4 * deal_num * channel); - __sync(); - // load data_value - for (int32_t p_idx = 0; p_idx < io_data_num; ++p_idx) { - const int32_t inner_point_num = (int32_t)((float *)point_ram)[p_idx]; - const int32_t tl_offset = ((int32_t *)index_tl)[p_idx]; - const int32_t bl_offset = ((int32_t *)index_bl)[p_idx]; - const int32_t level_start_id = - ((int32_t *)data_level_start_index_nram)[(p_idx / num_points) % - num_levels]; -#if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - const char *data_value_ptr = - data_value_gdram_start + - (level_start_id * num_heads * channels + c_iter * channel) * - sizeof(float); -#else - const int32_t head_idx = ((p_idx + offset_g + grid_iter * deal_num) / - (num_levels * num_points)) % - num_heads; - const char *data_value_ptr = - data_value_gdram_start + - 
(level_start_id * num_heads * channels + head_idx * channels + - c_iter * channel) * - sizeof(float); -#endif - switch (inner_point_num) { - case 16: // 4 points are cached. - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - break; - case 12: // 2 points are cached. (top_left, top_right) - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - break; - case 4: // 2 points are cached. (bottom_left, bottom_right) - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - break; - case 10: // 2 points are cached. (top_left, bottom_left) - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 6: // 2 points are cached. (top_right, bottom_right) - __memcpy_async( - (float *)input_tr + p_idx * channel, - (float *)data_value_ptr + tl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - __memcpy_async( - (float *)input_br + p_idx * channel, - (float *)data_value_ptr + bl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 7: // 1 point is cached. (top_left) - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 5: // 1 point is cached. 
(top_right) - __memcpy_async( - (float *)input_tr + p_idx * channel, - (float *)data_value_ptr + tl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 3: // 1 point is cached. (bottom_left) - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 1: // 1 point is cached. (bottom_right) - __memcpy_async( - (float *)input_br + p_idx * channel, - (float *)data_value_ptr + bl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - default: - continue; - } - } - __sync(); - // interpolation - __bang_mul((float *)input_tl, (float *)input_tl, (float *)weight_tl, - 4 * deal_num * channel); - __bang_add((float *)input_tl, (float *)input_tl, (float *)input_bl, - 2 * deal_num * channel); - __bang_add((float *)input_tl, (float *)input_tl, (float *)input_tr, - deal_num * channel); - // load attention weight - void *attn_weight = mask_tl; - __memcpy((float *)attn_weight, - (float *)data_attn_weight_gdram + grid_off_base, - io_data_num * sizeof(float), GDRAM2NRAM); - // calc data_col, muladd attention weight - __bang_transpose((float *)input_tr, (float *)input_tl, deal_num, - channel); - __bang_cycle_mul((float *)input_tr, (float *)input_tr, - (float *)attn_weight, deal_num * channel, deal_num); - __bang_transpose((float *)input_tl, (float *)input_tr, channel, - deal_num); - __bang_sumpool((float *)input_bl, (float *)input_tl, channel, 1, - io_data_num, 1, num_levels * num_points, - num_levels * num_points, 1); - // store - __memcpy((float *)data_col_gdram_start + c_iter * channel, - (float *)input_bl, c_real_num * sizeof(float), NRAM2GDRAM, - channels * sizeof(float), channel * sizeof(float), - (io_data_num / (num_levels * num_points)) - 1); - } - } - } - __sync(); - return; -#endif -} - -template __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char 
*data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); diff --git a/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu b/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu deleted file mode 100644 index 18ec006f5..000000000 --- a/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu +++ /dev/null @@ -1,484 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include - -#include "kernels/ms_deform_attn_forward/ms_deform_attn_forward.h" - -#define TWELVE_SPLIT 12 -#define ELE_COUNT 32 /* cycle element count */ - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template -__mlu_func__ void loadNeighborPointsData( - const T *data_value_gdram, T *data_value_p1_nram, T *data_value_p2_nram, - T *data_value_p3_nram, T *data_value_p4_nram, const size_t &deal_num, - const int32_t &width, const int32_t &height, const int32_t &num_heads, - const int32_t &channels, const T &x, const T &y, const int32_t &head_idx) { - const int32_t w_low = floorf(x); - const int32_t h_low = floorf(y); - const int32_t w_high = w_low + 1; - const int32_t h_high = h_low + 1; - const int32_t w_stride = num_heads * channels; - const int32_t h_stride = width * w_stride; - const int32_t h_low_ptr_offset = h_low * h_stride; - const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int32_t w_low_ptr_offset = w_low * w_stride; - const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int32_t base_ptr_offset = head_idx * channels; - // top-left point - if (h_low >= 0 && w_low >= 0) { - const int32_t v1_offset = - h_low_ptr_offset + w_low_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p1_nram, data_value_gdram + v1_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - // top-right point - if (h_low >= 0 && w_high <= width - 1) { - const int32_t v2_offset = - h_low_ptr_offset + w_high_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p2_nram, data_value_gdram + v2_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - // bottom-left point - if (h_high <= height - 1 && w_low >= 0) { - const int32_t v3_offset = - h_high_ptr_offset + w_low_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p3_nram, data_value_gdram + v3_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - // bottom-right point - if (h_high <= height - 1 && w_high <= width - 1) { 
- const int32_t v4_offset = - h_high_ptr_offset + w_high_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p4_nram, data_value_gdram + v4_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } -} - -template -__mlu_func__ void computeMsDeformAttn( - T *data_value_p1_nram, T *data_value_p2_nram, T *data_value_p3_nram, - T *data_value_p4_nram, T *sample_point_value, T *auxiliary_b, - T *data_col_nram, const T &weight, const size_t &deal_num, - const int32_t &width, const int32_t &height, const T &x, const T &y) { - const int32_t w_low = floorf(x); - const int32_t h_low = floorf(y); - const int32_t w_high = w_low + 1; - const int32_t h_high = h_low + 1; - const T lw = x - w_low; - const T lh = y - h_low; - const T hw = 1 - lw; - const T hh = 1 - lh; - const T w1 = hh * hw; - const T w2 = hh * lw; - const T w3 = lh * hw; - const T w4 = lh * lw; - - __bang_write_value((T *)sample_point_value, deal_num, (T)0); - - // top-left point - if (h_low >= 0 && w_low >= 0) { - // sample_point_value += v1 * w1 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p1_nram, (T)w1, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - // top-right point - if (h_low >= 0 && w_high <= width - 1) { - // sample_point_value += v2 * w2 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p2_nram, (T)w2, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - // bottom-left point - if (h_high <= height - 1 && w_low >= 0) { - // sample_point_value += v3 * w3 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p3_nram, (T)w3, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - // bottom-right point - if (h_high <= height - 1 && w_high <= width - 1) { - // sample_point_value += v4 * w4 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p4_nram, (T)w4, - deal_num); - __bang_add((T *)sample_point_value, (T 
*)sample_point_value, - (T *)auxiliary_b, deal_num); - } - __bang_mul_scalar((T *)sample_point_value, (T *)sample_point_value, (T)weight, - deal_num); - __bang_add((T *)data_col_nram, (T *)data_col_nram, (T *)sample_point_value, - deal_num); -} - -template -__mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { - if (__is_mpu()) { - return; - } - const size_t spatial_size = PAD_UP(2 * sizeof(int32_t), NFU_ALIGN_SIZE); - const size_t span_num_deal = - PAD_DOWN((MAX_NRAM_SIZE - spatial_size) / TWELVE_SPLIT / sizeof(T), - NFU_ALIGN_SIZE); - const size_t align_num = NFU_ALIGN_SIZE; - const int32_t channels_seg_num = channels / span_num_deal; - const size_t channels_rem = channels % span_num_deal; - const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num); - char *data_spatial_shapes_nram = nram_buffer; - char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size; - char *ping_data_value_p2_nram = - ping_data_value_p1_nram + span_num_deal * sizeof(T); - char *ping_data_value_p3_nram = - ping_data_value_p2_nram + span_num_deal * sizeof(T); - char *ping_data_value_p4_nram = - ping_data_value_p3_nram + span_num_deal * sizeof(T); - char *ping_data_col_nram = - ping_data_value_p4_nram + span_num_deal * sizeof(T); - char *pong_data_value_p1_nram = - ping_data_col_nram + span_num_deal * sizeof(T); - char *pong_data_value_p2_nram = - pong_data_value_p1_nram + span_num_deal * sizeof(T); - char *pong_data_value_p3_nram = - pong_data_value_p2_nram + span_num_deal * sizeof(T); - char *pong_data_value_p4_nram = - pong_data_value_p3_nram + span_num_deal * sizeof(T); 
- char *pong_data_col_nram = - pong_data_value_p4_nram + span_num_deal * sizeof(T); - char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T); - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T); - size_t data_col_ping_pong_idx = 0; - int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim; - const int32_t block_num_rem = - (batch_size * num_queries * num_heads) % taskDim; - const int32_t idx_start = taskId < (block_num_rem + 1) - ? taskId * (block_num_per_core + 1) - : taskId * block_num_per_core + block_num_rem; - block_num_per_core = - taskId < block_num_rem - ? (batch_size * num_queries * num_heads) / taskDim + 1 - : (batch_size * num_queries * num_heads) / taskDim; - for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core; - ++cur_idx) { - /* - cur_idx = batch_idx * num_queries * num_heads + - query_idx * num_heads + head_idx - */ - const int32_t head_idx = cur_idx % num_heads; - const int32_t batch_idx = (cur_idx / num_heads) / num_queries; - const char *data_value_gdram_start = - data_value_gdram + - batch_idx * num_keys * num_heads * channels * sizeof(T); - const char *data_sampling_loc_gdram_start = - data_sampling_loc_gdram + - cur_idx * num_levels * num_points * 2 * sizeof(T); - const char *data_attn_weight_gdram_start = - data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T); - char *data_col_gdram_start = - data_col_gdram + cur_idx * channels * sizeof(T); - for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) { - __bang_write_value( - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), - span_num_deal, (T)0); - // load data - // level_idx = 0, point_idx = 0 - __memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram, - 2 * sizeof(int32_t), GDRAM2NRAM); - int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; - int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; 
- const char *data_value_ptr = - data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T); - T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; - T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; - T weight = ((T *)data_attn_weight_gdram_start)[0]; - T x = loc_w * spatial_w - 0.5; - T y = loc_h * spatial_h - 0.5; - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, (T *)ping_data_value_p1_nram, - (T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram, - (T *)ping_data_value_p4_nram, span_num_deal, spatial_w, spatial_h, - num_heads, channels, x, y, head_idx); - } - T spatial_h_next_point = 0; - T spatial_w_next_point = 0; - T weight_next_point = 0; - T x_next_point = 0; - T y_next_point = 0; - __sync(); - for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) { - for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { - // load data - if (point_idx == num_points - 1 && level_idx == num_levels - 1) { - // last point no need to load data, continue to compute - } else if (point_idx == num_points - 1) { - const int32_t level_start_id = - ((int32_t *)data_level_start_index_gdram)[level_idx + 1]; - const int32_t spatial_h_ptr = (level_idx + 1) << 1; - __memcpy( - data_spatial_shapes_nram, - data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t), - 2 * sizeof(int32_t), GDRAM2NRAM); - spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0]; - spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1]; - data_value_ptr = data_value_gdram_start + - (level_start_id * num_heads * channels + - c_seg_idx * span_num_deal) * - sizeof(T); - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w_next_point 
- 0.5; - y_next_point = loc_h * spatial_h_next_point - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h_next_point && - x_next_point < spatial_w_next_point) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - span_num_deal, spatial_w_next_point, spatial_h_next_point, - num_heads, channels, x_next_point, y_next_point, head_idx); - } - } else { - spatial_h_next_point = spatial_h; - spatial_w_next_point = spatial_w; - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w - 0.5; - y_next_point = loc_h * spatial_h - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && x_next_point < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - span_num_deal, spatial_w, spatial_h, num_heads, channels, - x_next_point, y_next_point, head_idx); - } - } - // compute 
- if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - computeMsDeformAttn( - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)auxiliary_a, (T *)auxiliary_b, - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT - weight, span_num_deal, spatial_w, spatial_h, x, y); - } - spatial_w = spatial_w_next_point; - spatial_h = spatial_h_next_point; - weight = weight_next_point; - x = x_next_point; - y = y_next_point; - __sync(); - } - } - // store - __memcpy_async( - data_col_gdram_start + c_seg_idx * span_num_deal * sizeof(T), - ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap, - span_num_deal * sizeof(T), NRAM2GDRAM); - data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2; - } - if (channels_rem > 0) { - __bang_write_value( - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), - channels_align_rem, (T)0); - // load data - // level_idx = 0, point_idx = 0 - __memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram, - 2 * sizeof(int32_t), GDRAM2NRAM); - int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; - int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = - data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T); - T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; - T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; - T weight = ((T *)data_attn_weight_gdram_start)[0]; - T x = loc_w * spatial_w - 0.5; - T y = loc_h * spatial_h - 0.5; - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, (T 
*)ping_data_value_p1_nram, - (T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram, - (T *)ping_data_value_p4_nram, channels_rem, spatial_w, spatial_h, - num_heads, channels, x, y, head_idx); - } - T spatial_h_next_point = 0; - T spatial_w_next_point = 0; - T weight_next_point = 0; - T x_next_point = 0; - T y_next_point = 0; - __sync(); - for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) { - for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { - // load data - if (point_idx == num_points - 1 && level_idx == num_levels - 1) { - // last point no need to load data, continue to compute - } else if (point_idx == num_points - 1) { - const int32_t level_start_id = - ((int32_t *)data_level_start_index_gdram)[level_idx + 1]; - const int32_t spatial_h_ptr = (level_idx + 1) << 1; - __memcpy( - data_spatial_shapes_nram, - data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t), - 2 * sizeof(int32_t), GDRAM2NRAM); - spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0]; - spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1]; - data_value_ptr = data_value_gdram_start + - (level_start_id * num_heads * channels + - channels_seg_num * span_num_deal) * - sizeof(T); - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w_next_point - 0.5; - y_next_point = loc_h * spatial_h_next_point - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h_next_point && - x_next_point < spatial_w_next_point) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) 
% 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - channels_rem, spatial_w_next_point, spatial_h_next_point, - num_heads, channels, x_next_point, y_next_point, head_idx); - } - } else { - spatial_w_next_point = spatial_w; - spatial_h_next_point = spatial_h; - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w - 0.5; - y_next_point = loc_h * spatial_h - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && x_next_point < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - channels_rem, spatial_w, spatial_h, num_heads, channels, - x_next_point, y_next_point, head_idx); - } - } - // compute - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - computeMsDeformAttn( - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T 
*)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)auxiliary_a, (T *)auxiliary_b, - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT - weight, channels_align_rem, spatial_w, spatial_h, x, y); - } - spatial_w = spatial_w_next_point; - spatial_h = spatial_h_next_point; - weight = weight_next_point; - x = x_next_point; - y = y_next_point; - __sync(); - } - } - // store - __memcpy_async( - data_col_gdram_start + channels_seg_num * span_num_deal * sizeof(T), - ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap, - channels_rem * sizeof(T), NRAM2GDRAM); - data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2; - } - } - __sync(); - return; -} - -template __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); diff --git a/kernels/mutual_information_backward/mutual_information_backward.cpp b/kernels/mutual_information_backward/mutual_information_backward.cpp deleted file mode 100644 index e4a6883d2..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward.cpp +++ /dev/null @@ -1,863 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "mutual_information_backward.h" - -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/utils/cnnl_helper.h" - -#define API_NAME "[mluOpMutualInformationBackward]" - -mluOpStatus_t MLUOP_WIN_API mluOpGetMutualInformationBackwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, const bool overwrite_ans_grad, - size_t *workspace_size) { - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_grad_desc != nullptr); - PARAM_CHECK(API_NAME, workspace_size != nullptr); - // Use for p_grad size, only support float data type now - *workspace_size = mluOpGetTensorElementNum(p_desc) * sizeof(float); - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDim( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, - const mluOpTensorDescriptor_t px_grad_desc, - const mluOpTensorDescriptor_t py_grad_desc) { - if (3 != px_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of px must be 3. " - << "But now the dim of px is " << px_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != py_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of py must be 3. 
" - << "But now the dim of py is " << py_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (nullptr != opt_boundary_desc && 2 != opt_boundary_desc->dim) { - LOG(ERROR) << API_NAME - << " The dim of opt_boundary must be 2 when opt_boundary is " - << "not NULL. But now the dim of opt_boundary is " - << opt_boundary_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != p_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of p must be 3. " - << "But now the dim of p is " << p_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (1 != ans_grad_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of ans_grad must be 1. " - << "But now the dim of ans_grad is " << ans_grad_desc->dim - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != px_grad_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of px_grad must be 3. " - << "But now the dim of px_grad is " << px_grad_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != py_grad_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of py_grad must be 3. " - << "But now the dim of py_grad is " << py_grad_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorShape( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, - const mluOpTensorDescriptor_t px_grad_desc, - const mluOpTensorDescriptor_t py_grad_desc) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - if (B != py_desc->dims[0] || B != p_desc->dims[0] || - B != ans_grad_desc->dims[0] || B != px_grad_desc->dims[0] || - B != py_grad_desc->dims[0]) { - LOG(ERROR) << API_NAME - << " px.shape[0], py.shape[0], p.shape[0], ans_grad.shape[0], " - << "px_grad.shape[0] and py_grad.shape[0] must be same. 
But now " - << "px.shape[0] is " << px_desc->dims[0] << ", py.shape[0] is " - << py_desc->dims[0] << ", p.shape[0] is " << p_desc->dims[0] - << ", ans_grad.shape[0] is " << ans_grad_desc->dims[0] - << ", px_grad.shape[0] is " << px_grad_desc->dims[0] - << ", py_grad.shape[0] is " << py_grad_desc->dims[0] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // Currently only supports !modified, so the shape of px must be [B, S, T+1] - if (T + 1 != px_desc->dims[2]) { - LOG(ERROR) << API_NAME << " Currently only supports the case that " - << "px.shape[2] must be equal to py.shape[2] + 1. But now " - << "px.shape[2] is " << px_desc->dims[2] << ", py.shape[2] is " - << py_desc->dims[2] << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // The shape of py must be [B, S+1, T] - if (S + 1 != py_desc->dims[1]) { - LOG(ERROR) << API_NAME << " py.shape[1] must be equal to px.shape[1] + 1. " - << "But now px.shape[1] is " << px_desc->dims[1] - << ", py.shape[1] is " << py_desc->dims[1] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of opt_boundary must be [B, 4] - if (nullptr != opt_boundary_desc && - (B != opt_boundary_desc->dims[0] || 4 != opt_boundary_desc->dims[1])) { - LOG(ERROR) << API_NAME << " When opt_boundary is not NULL, " - << "opt_boundary.shape[0] and px.shape[0] must be same, and " - << "opt_boundary.shape[1] must be 4. But now " - << "px.shape[0] is " << px_desc->dims[0] - << ", opt_boundary.shape[0] is " << opt_boundary_desc->dims[0] - << ", opt_boundary.shape[1] is " << opt_boundary_desc->dims[1] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of p must be [B, S+1, T+1] - if (S + 1 != p_desc->dims[1] || T + 1 != p_desc->dims[2]) { - LOG(ERROR) << API_NAME << " p.shape[1] and py.shape[1] must be same, and " - << "p.shape[2] must be equal to py.shape[2] + 1. 
" - << "But now p.shape[1] is " << p_desc->dims[1] - << ", py.shape[1] is " << py_desc->dims[1] << ", p.shape[2] is " - << p_desc->dims[2] << ", py.shape[2] is " << py_desc->dims[2] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of px and px_grad must be same: [B, S, T+1] - for (int i = 1; i < px_grad_desc->dim; ++i) { - if (px_grad_desc->dims[i] != px_desc->dims[i]) { - LOG(ERROR) << API_NAME - << " The shape of px and px_grad must be same. But now " - << "px.shape[" << i << "] is " << px_desc->dims[i] - << ", px_grad.shape[" << i << "] is " << px_grad_desc->dims[i] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - - // The shape of py and py_grad must be same: [B, S+1, T] - for (int i = 1; i < py_grad_desc->dim; ++i) { - if (py_grad_desc->dims[i] != py_desc->dims[i]) { - LOG(ERROR) << API_NAME - << " The shape of py and py_grad must be same. But now " - << "py.shape[" << i << "] is " << py_desc->dims[i] - << ", py_grad.shape[" << i << "] is " << py_grad_desc->dims[i] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDatatype( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, - const mluOpTensorDescriptor_t px_grad_desc, - const mluOpTensorDescriptor_t py_grad_desc) { - if (MLUOP_DTYPE_FLOAT != px_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of px currently only support float. But now " - << "the data type of px is " - << mluOpGetNameOfDataType(px_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != py_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of py currently only support float. 
But now " - << "the data type of py is " - << mluOpGetNameOfDataType(py_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (nullptr != opt_boundary_desc && - MLUOP_DTYPE_INT64 != opt_boundary_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of opt_boundary currently only support int64." - << " But now the data type of opt_boundary is " - << mluOpGetNameOfDataType(opt_boundary_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != p_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of p currently only support float. But now " - << "the data type of p is " - << mluOpGetNameOfDataType(p_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != ans_grad_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of ans_grad currently only support float. " - << "But now the data type of ans_grad is " - << mluOpGetNameOfDataType(ans_grad_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != px_grad_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of px_grad currently only support float. " - << "But now the data type of px_grad is " - << mluOpGetNameOfDataType(px_grad_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != py_grad_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of py_grad currently only support float. 
" - << "But now the data type of py_grad is " - << mluOpGetNameOfDataType(py_grad_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorScaleLimit( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc) { - // Check large tensor - // The shape of px and px_grad are the same, - // The shape of py and py_grad are the same, - // So there is no need to check the tensor num of px_grad and py_grad - if (mluOpGetTensorElementNum(px_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(py_desc) >= LARGE_TENSOR_NUM || - (nullptr != opt_boundary_desc && - mluOpGetTensorElementNum(opt_boundary_desc) >= LARGE_TENSOR_NUM) || - mluOpGetTensorElementNum(p_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << API_NAME << " Overflow max tensor num." - << " Current operator supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorPtr( - const void *px, const void *py, const void *p, const void *ans_grad, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const void *px_grad, const void *py_grad, const int S, const int T, - bool &has_boundary) { - if (S > 0) { - PARAM_CHECK(API_NAME, px != nullptr); - PARAM_CHECK(API_NAME, px_grad != nullptr); - } else { - VLOG(5) << API_NAME << " px.shape[1] is zero."; - } - - if (T > 0) { - PARAM_CHECK(API_NAME, py != nullptr); - PARAM_CHECK(API_NAME, py_grad != nullptr); - } else { - VLOG(5) << API_NAME << " py.shape[2] is zero."; - } - - PARAM_CHECK(API_NAME, p != nullptr); - PARAM_CHECK(API_NAME, ans_grad != nullptr); - - if (nullptr != opt_boundary_desc && nullptr != opt_boundary) { - has_boundary = true; - VLOG(5) << API_NAME << " opt_boundary is not NULL."; - - } else if (nullptr == opt_boundary_desc && nullptr 
== opt_boundary) { - has_boundary = false; - VLOG(5) << API_NAME << " opt_boundary is NULL."; - } else { - LOG(ERROR) << API_NAME - << " opt_boundary_desc and opt_boundary must both be NULL, " - << "or both not be NULL."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t mutualInformationBackwardParamCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_grad_desc, void *ans_grad, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t px_grad_desc, void *px_grad, - const mluOpTensorDescriptor_t py_grad_desc, void *py_grad, - bool &has_boundary, bool &zero_element) { - // 1. check handle and tensor_desc - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_grad_desc != nullptr); - PARAM_CHECK(API_NAME, px_grad_desc != nullptr); - PARAM_CHECK(API_NAME, py_grad_desc != nullptr); - - // Since the layout of all tensors are ARRAY, so skip check tensor layout - - // 2. check mlu platform - if (handle->arch < 372) { - LOG(ERROR) << API_NAME << " Only mlu300 and above devices are supported." - << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // 3. check tensor dim - mluOpStatus_t check_status = - checkTensorDim(px_desc, py_desc, opt_boundary_desc, p_desc, ans_grad_desc, - px_grad_desc, py_grad_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 4. 
check tensor shape - check_status = checkTensorShape(px_desc, py_desc, opt_boundary_desc, p_desc, - ans_grad_desc, px_grad_desc, py_grad_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 5. check tensor dtype - check_status = - checkTensorDatatype(px_desc, py_desc, opt_boundary_desc, p_desc, - ans_grad_desc, px_grad_desc, py_grad_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 6. check scale limit, for large tensor - check_status = checkTensorScaleLimit(handle, px_desc, py_desc, - opt_boundary_desc, p_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - // 7. check zero element. - if (0 == B || (0 == S && 0 == T)) { - zero_element = true; - VLOG(5) << API_NAME << " Skip zero element tensor when px.shape[0] is zero " - << "or px.shape[1] and py.shape[2] are both zero."; - return MLUOP_STATUS_SUCCESS; - } - - // 8 check workspace - if (workspace_size > 0) { - PARAM_CHECK(API_NAME, workspace != nullptr); - } - - // 9. 
check tensor ptr - check_status = - checkTensorPtr(px, py, p, ans_grad, opt_boundary_desc, opt_boundary, - px_grad, py_grad, S, T, has_boundary); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - return MLUOP_STATUS_SUCCESS; -} - -static void mutualInformationBackwardGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_grad_desc, void *ans_grad, - const bool overwrite_ans_grad, const mluOpTensorDescriptor_t px_grad_desc, - void *px_grad, const mluOpTensorDescriptor_t py_grad_desc, void *py_grad) { - GEN_CASE_START("mutual_information_backward"); - GEN_CASE_HANDLE(handle); - - GEN_CASE_DATA(true, "px", px, px_desc, -1, 1); - GEN_CASE_DATA(true, "py", py, py_desc, -1, 1); - if (nullptr != opt_boundary) { - GEN_CASE_DATA_REAL(true, "opt_boundary", opt_boundary, opt_boundary_desc); - } - GEN_CASE_DATA(true, "p", p, p_desc, -1, 1); - GEN_CASE_DATA(true, "ans_grad", ans_grad, ans_grad_desc, -1, 1); - GEN_CASE_DATA(false, "ans_grad", ans_grad, ans_grad_desc, -1, 1); - GEN_CASE_DATA(false, "px_grad", px_grad, px_grad_desc, -1, 1); - GEN_CASE_DATA(false, "py_grad", py_grad, py_grad_desc, -1, 1); - - GEN_CASE_OP_PARAM_SINGLE(0, "mutual_information_backward", - "overwrite_ans_grad", overwrite_ans_grad); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -static void policyFunc3Pipeline(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, int batch_size) { - int core_num = mluop::runtime::getClusterLimitCapability(handle) * - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - *k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim->x = 1; - k_dim->y = batch_size < core_num ? 
batch_size : core_num; - k_dim->z = 1; -} - -static mluOpStatus_t launchMutualInformationBackward3PipelineKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, - void *py_grad) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc3Pipeline(handle, &k_dim, &k_type, B); - VLOG(5) << "Launch Kernel 3PipelineMutualInformationBackward<<>>"; - CHECK_RETURN( - "[MutualInformationBackward]", - kernel3PipelineMutualInformationBackward( - k_dim, k_type, handle->queue, B, S, T, px, py, has_boundary, - opt_boundary, p, overwrite_ans_grad, ans_grad, px_grad, py_grad)); - - return MLUOP_STATUS_SUCCESS; -} - -// Calculate computing diagonal number of partition mode for default kernel -static void calComputingDiags(const int S, const int T, - int64_t *computing_diag_num, int *s_block_size, - int *t_block_size, int *s_repeat, int *t_repeat, - int *s_remainder, int *t_remainder, - const int mode) { - // If has remainder part, rearrange block size to balance work load - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - if (s_remainder[mode] > 0) { - s_block_size[mode] = S / (s_repeat[mode] + 1); - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - } - - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - if (t_remainder[mode] > 0) { - t_block_size[mode] = T / (t_repeat[mode] + 1); - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - } - - // Accumulate all block's computing diagonal numbers - computing_diag_num[mode] = s_repeat[mode] * t_repeat[mode] * - (s_block_size[mode] + t_block_size[mode] - 1); - if 
(s_remainder[mode] > 0) { - computing_diag_num[mode] += - t_repeat[mode] * (t_block_size[mode] + s_remainder[mode] - 1); - } - - if (t_remainder[mode] > 0) { - computing_diag_num[mode] += - s_repeat[mode] * (s_block_size[mode] + t_remainder[mode] - 1); - } - - if (s_remainder[mode] > 0 && t_remainder[mode] > 0) { - computing_diag_num[mode] += s_remainder[mode] + t_remainder[mode] - 1; - } -} - -static void assignPartitionParams(const int *s_block_size, - const int *t_block_size, const int *s_repeat, - const int *t_repeat, const int *s_remainder, - const int *t_remainder, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder, const int mode) { - final_s_block_size = s_block_size[mode]; - final_t_block_size = t_block_size[mode]; - final_s_repeat = s_repeat[mode]; - final_t_repeat = t_repeat[mode]; - final_s_remainder = s_remainder[mode]; - final_t_remainder = t_remainder[mode]; -} - -static void calDefaultPartition(const int S, const int T, const int N_size, - const int nram_size, int &job_diag_num, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder) { - // Compute each partition's job diagonal number, - // and choose the partition method with the least job diagonal number: - // 1) all S and T, no partition, launch once in one batch; - // 2) S < max_N_size, compare with (S, t) and (S/2, t); - // 3) T < max_N_size, compare with (s, T) and (s, T/2); - // 4) both S and T > max_N_size, compare with (N, N), (S, t), (s, T), if - // exist; - if (S <= N_size && T <= N_size) { - // once can compute all SxT onchip - job_diag_num = 1; - final_s_block_size = S; - final_t_block_size = T; - final_s_repeat = 1; - final_t_repeat = 1; - final_s_remainder = 0; - final_t_remainder = 0; - return; - } else { - // Sum of each partition's number of computing diagonals - // at most 3 arrays of candidate 
partition mode - int mode; - int64_t computing_diag_num[3] = {0}; - int s_block_size[3] = {0}; - int t_block_size[3] = {0}; - int s_repeat[3] = {0}; - int t_repeat[3] = {0}; - int s_remainder[3] = {0}; - int t_remainder[3] = {0}; - - if (S <= N_size && T > N_size) { - // compare with (S, t) and (S/2, t) - // 1) deal_s = S; min(s, t) = s; - mode = 0; - s_block_size[0] = S; - t_block_size[0] = (nram_size / sizeof(float) - 8 * s_block_size[0]) / - (4 * s_block_size[0] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S/2; min(s, t) = s; - mode = 1; - s_block_size[1] = std::max(S / 2, 1); // at least 1 number in s_block - t_block_size[1] = (nram_size / sizeof(float) - 8 * s_block_size[1]) / - (4 * s_block_size[1] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else if (S > N_size && T <= N_size) { - // compare with (s, T) and (s, T/2) - // 1) deal_t = T; min(s, t) = t; - mode = 0; - t_block_size[0] = T; - s_block_size[0] = (nram_size / sizeof(float) - 8 * t_block_size[0]) / - (4 * t_block_size[0] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_t = T/2; min(s, t) = t; - mode = 1; - t_block_size[1] = std::max(T / 2, 1); // at least 1 number in t_block - s_block_size[1] = (nram_size 
/ sizeof(float) - 8 * t_block_size[1]) / - (4 * t_block_size[1] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else { // S > N_size, T > N_size, choose between (N,N), (S,t), (s,T) - // 1) deal_s = deal_t = N_size; min(s,t) = s = t; - mode = 0; - s_block_size[0] = N_size; - t_block_size[0] = N_size; - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S, deal_t = t; min(s,t) = t; - mode = 1; - s_block_size[1] = N_size; - t_block_size[1] = (nram_size / sizeof(float) - 2 * s_block_size[1]) / - (4 * s_block_size[1] + 8); - if (t_block_size[1] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[1] = -1; // not support on this partition - } - // 3) deal_t = T, deal_s = s; min(s,t) = s; - mode = 2; - t_block_size[2] = T; - s_block_size[2] = (nram_size / sizeof(float) - 2 * t_block_size[2]) / - (4 * t_block_size[2] + 8); - if (s_block_size[2] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[2] = -1; // not support on this partition - } - - if (computing_diag_num[0] > 0 && // mode 0 is valid - ((computing_diag_num[1] <= 0) || // mode 1 is invalid or - 
computing_diag_num[0] <= - computing_diag_num[1])) { // mode 0 is better than mode 1 - if (computing_diag_num[2] > 0 && // mode 2 is valid and - computing_diag_num[2] < - computing_diag_num[0]) { // mode 2 is better than mode 0 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 0 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 0); - } - } else { // mode 1 is valid and mode 1 is better than mode 0 - if (computing_diag_num[2] > 0 && // mode 2 is valid - computing_diag_num[2] < - computing_diag_num[1]) { // mode 2 is better than mode 1 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 1 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 1); - } - } - } - // total job diagonal number in parallel - job_diag_num = final_s_repeat + (int)(final_s_remainder > 0) + - final_t_repeat + (int)(final_t_remainder > 0) - 1; - } -} - -static mluOpStatus_t launchMutualInformationBackwardDefaultKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, void *py_grad, - void *p_grad) { - // At first, use Fill Op to set 
px_grad, py_grad to all 0 - VLOG(5) << API_NAME << " cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - if (mluOpGetTensorElementNum(px_desc) > 0) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(px_desc, cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, px_grad)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (mluOpGetTensorElementNum(py_desc) > 0) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(py_desc, cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, py_grad)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << API_NAME << " cnnlFill_v3 end."; - - // When S and T is too large, launch default kernel with partition of S and T - // 1. Compute current arch max N size, according to NRAM size and device RAM - // 2. Use max_N_size to calculate different partition mode computing diagonal - // numbers and choose the partition mode, which has the least computing - // diagonal number - // 3. Launch default kernels by diagonal in parallel, with check of MaxDimX - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - // 1. 
According to on-chip RAM size, calculate current arch partition block - // size by square, Use max_N_size to partition on S and T dimension RAM space: - // 2*S*T + 2*(S+1)*(T+1) + 2*min(S,T) + 4*min(S,T)+1 - int max_N_size = (int)(std::sqrt(handle->nram_size / sizeof(float) / 4)) - 2; - // Use max square size N, partition on T and S dimension, launch by diagonal: - // -|------T--------| - // :| N1| N2| N3| N4| - // :|---|---|---|---| - // S| N2| N3| N4| N5| - // :|---|---|---|---| - // :| N3| N4| N5| N6| - // -|---------------| - - VLOG(5) << "Current arch Max square N size is " << max_N_size; - - int job_diag_num; // number of default kernel launch steps by diagonal - int s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, t_remainder; - - // 2. Choose the partition mode, which has the least computing diagonal number - // NOTE: p_grad has dimension (S+1, T+1), in function directly use (S, T) - // instead - calDefaultPartition(S + 1, T + 1, max_N_size, handle->nram_size, job_diag_num, - s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder); - int s_block_num = s_repeat + (int)(s_remainder > 0); - int t_block_num = t_repeat + (int)(t_remainder > 0); - int max_s_t_block_num = std::max(s_block_num, t_block_num); - int min_s_t_block_num = std::min(s_block_num, t_block_num); - - k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim.y = 1; - k_dim.z = 1; - // Get current arch support max dim_x value - int task_dim_x_limit; - cnDeviceGetAttribute(&task_dim_x_limit, - CN_DEVICE_ATTRIBUTE_MAX_BLOCK_TASK_DIM_X, - handle->device); - VLOG(5) << "Current arch MAX_BLOCK_TASK_DIM_X is " << task_dim_x_limit; - - // 3. Traverse step_i from 0 to (job_diag_num - 1) - for (int step_i = 0; step_i < job_diag_num; step_i++) { - int job_num_on_step = B * (step_i < max_s_t_block_num - ? 
std::min(step_i + 1, min_s_t_block_num) - : s_block_num + t_block_num - step_i - 1); - k_dim.x = job_num_on_step; - // Make sure not exceed max dim x limit - if (k_dim.x > task_dim_x_limit) { - int task_dim_change = (k_dim.x + task_dim_x_limit - 1) / task_dim_x_limit; - k_dim.x = (k_dim.x + task_dim_x_limit - 1) / task_dim_change; - k_dim.y = k_dim.y * task_dim_change; - } - - VLOG(5) << "Launch Kernel DefaultMutualInformationBackward<<< step " - << step_i << " of Batch Block: " << k_dim.x << ", " << k_dim.y - << ", " << k_dim.z << ">>>"; - CHECK_RETURN("[MutualInformationBackward]", - kernelDefaultMutualInformationBackward( - k_dim, k_type, handle->queue, B, S, T, step_i, - job_num_on_step, s_block_num, t_block_num, s_block_size, - t_block_size, px, py, has_boundary, opt_boundary, p, - overwrite_ans_grad, ans_grad, px_grad, py_grad, p_grad)); - } - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMutualInformationBackward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_grad_desc, void *ans_grad, - const bool overwrite_ans_grad, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t px_grad_desc, void *px_grad, - const mluOpTensorDescriptor_t py_grad_desc, void *py_grad) { - // 1. Paramcheck - bool has_boundary = false; - bool zero_element = false; - mluOpStatus_t check_status = mutualInformationBackwardParamCheck( - handle, px_desc, px, py_desc, py, opt_boundary_desc, opt_boundary, p_desc, - p, ans_grad_desc, ans_grad, workspace, workspace_size, px_grad_desc, - px_grad, py_grad_desc, py_grad, has_boundary, zero_element); - - if (MLUOP_STATUS_SUCCESS != check_status || zero_element) { - return check_status; - } - - // 2. 
Generate case - if (MLUOP_GEN_CASE_ON_NEW) { - mutualInformationBackwardGencase( - handle, px_desc, px, py_desc, py, opt_boundary_desc, opt_boundary, - p_desc, p, ans_grad_desc, ans_grad, overwrite_ans_grad, px_grad_desc, - px_grad, py_grad_desc, py_grad); - } - - // Choose to launch 3pipeline or default kernel - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - bool is_launch_3pipeline = true; - // check 3pipeline scale limit for computing term1 and term2 - int current_size = T * (S + 1) + (T + 1) * S + 5 * (T + 1); - if (current_size > handle->nram_size / sizeof(float)) { - is_launch_3pipeline = false; - } - - // check 3pipeline scale limit for computing p_grad - current_size = - T * (S + 1) + (T + 1) * S + (T + 1) * (S + 1) + 3 * std::min(S, T) + 4; - if (current_size > handle->nram_size / sizeof(float)) { - is_launch_3pipeline = false; - } - - // 3. launch kernel - mluOpStatus_t return_status; - if (is_launch_3pipeline) { - // launch 3pipeline kernel when satisfy scale limit - return_status = launchMutualInformationBackward3PipelineKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, - overwrite_ans_grad, ans_grad, px_grad, py_grad); - } else { - // launch default kernel, workspace is for p_grad - return_status = launchMutualInformationBackwardDefaultKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, - overwrite_ans_grad, ans_grad, px_grad, py_grad, workspace); - } - - GEN_CASE_END(); - return return_status; -} diff --git a/kernels/mutual_information_backward/mutual_information_backward.h b/kernels/mutual_information_backward/mutual_information_backward.h deleted file mode 100644 index 5a4a477e6..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward.h +++ /dev/null @@ -1,45 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_H_ -#define KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_H_ - -#include "mlu_op.h" -#include "kernels/kernel.h" - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, - void *py_grad); - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, void *py_grad, - void *p_grad); - -#endif // KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_H_ diff --git a/kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu b/kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu deleted file mode 100644 index b55829d11..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu +++ /dev/null @@ -1,289 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "mutual_information_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_backward/mutual_information_backward_utils.h" - -__mlu_func__ void computeTerm1AndTerm2(const int b, const int S, const int T, - const int s_begin, const int s_end, - const int t_begin, const int t_end, - const float *px, const float *py, - const float *p) { - /* *********************nram space split********************** */ - /* | term1 | term2 | cur_p | next_p | large_neg | mask |*/ - /* | S*(T+1) | (S+1)*T | t_len | t_len | 2*t_len | t_len |*/ - float *nram_term1 = (float *)nram_buffer; - float *nram_term2 = nram_term1 + S * (T + 1); - float *nram_cur_p = nram_term2 + (S + 1) * T; - - int t_len = t_end - t_begin + 1; - - float *nram_next_p = nram_cur_p + t_len; - float *nram_large_neg = nram_next_p + t_len; - float *nram_mask = nram_large_neg + 2 * t_len; - - __bang_write_value(nram_large_neg, 2 * t_len, (float)-1.0e+30); - - for (int i = s_begin; i < s_end; ++i) { - // load p to cur_p and next_p - __memcpy(nram_cur_p, p + b * (S + 1) * (T + 1) + i * (T + 1) + t_begin, - t_len * sizeof(float), GDRAM2NRAM, t_len * sizeof(float), - (T + 1) * sizeof(float), 1); - __bang_nan_maximum(nram_cur_p, nram_cur_p, nram_large_neg, 2 * t_len); - - // load px to term1 - __memcpy(nram_term1 + i * (T + 1) + t_begin, - px + b * S * (T + 1) + i * (T + 1) + t_begin, - t_len * sizeof(float), GDRAM2NRAM); - __bang_fusion(FUSION_FAS, nram_term1 + i * (T + 1) + t_begin, - nram_term1 + i * (T + 1) + t_begin, nram_cur_p, nram_next_p, - t_len, t_len); - safeExp(nram_term1 + i * (T + 1) + t_begin, - nram_term1 + i * (T + 1) + t_begin, nram_mask, t_len); - - if (t_len > 1) { - // load py to term2 - __memcpy(nram_term2 + i * T + t_begin, - py + b * (S + 1) * T + i * T + t_begin, - (t_len - 1) * sizeof(float), GDRAM2NRAM); - 
__bang_fusion(FUSION_FAS, nram_term2 + i * T + t_begin, - nram_term2 + i * T + t_begin, nram_cur_p, nram_cur_p + 1, - t_len - 1, t_len - 1); - safeExp(nram_term2 + i * T + t_begin, nram_term2 + i * T + t_begin, - nram_mask, t_len - 1); - } - } - - if (t_len > 1) { - if (s_begin == s_end) { - // load p to next_p - __memcpy(nram_next_p, - p + b * (S + 1) * (T + 1) + s_end * (T + 1) + t_begin, - t_len * sizeof(float), GDRAM2NRAM); - __bang_nan_maximum(nram_next_p, nram_next_p, nram_large_neg, t_len); - } - // compute term2[s_end][:] - __memcpy(nram_term2 + s_end * T + t_begin, - py + b * (S + 1) * T + s_end * T + t_begin, - (t_len - 1) * sizeof(float), GDRAM2NRAM); - __bang_fusion(FUSION_FAS, nram_term2 + s_end * T + t_begin, - nram_term2 + s_end * T + t_begin, nram_next_p, - nram_next_p + 1, t_len - 1, t_len - 1); - safeExp(nram_term2 + s_end * T + t_begin, nram_term2 + s_end * T + t_begin, - nram_mask, t_len - 1); - } -} - -__mlu_func__ void computePGrad(const int b, const int S, const int T, - const int s_begin, const int s_end, - const int t_begin, const int t_end, - const bool overwrite_ans_grad, float *ans_grad) { - /* ***************************nram space split*************************** */ - /* | term1 | term2 | p_grad | cur_term1|zero|cur_term2|cur_p_grad| */ - /* | S*(T+1) | (S+1)*T |(S+1)*(T+1)| min_len | 1 | min_len | min_len | */ - float *nram_term1 = (float *)nram_buffer; - float *nram_term2 = nram_term1 + S * (T + 1); - float *nram_p_grad = nram_term2 + (S + 1) * T; - float *nram_cur_term1 = nram_p_grad + (S + 1) * (T + 1); - - int s_len = s_end - s_begin + 1; - int t_len = t_end - t_begin + 1; - int max_len = __mluop_max(s_len, t_len); - int min_len = __mluop_min(s_len, t_len); - - float *nram_cur_term2 = nram_cur_term1 + min_len + 1; - float *nram_cur_p_grad = nram_cur_term2 + min_len; - __bang_write_zero(nram_cur_term1, 3 * min_len + 1); - - // compute the last one: p_grad[b][s_end][t_end] = ans_grad[b] - __memcpy_async(nram_p_grad + s_end * (T + 1) 
+ t_end, ans_grad + b, - sizeof(float), GDRAM2NRAM); - __sync(); - nram_cur_p_grad[0] = nram_p_grad[s_end * (T + 1) + t_end]; - - int data_num = 0; - int s = 0; - int t = 0; - int term2_s = 0; - int term2_t = 0; - int term1_num = 0; - int term2_num = 0; - float *nram_p_grad_for_compute_term1 = nram_cur_p_grad; - float *nram_compute_term2 = nram_cur_term2; - - int loop_time = s_len + t_len - 1; - for (int i = 1; i < loop_time; ++i) { - data_num = i < max_len ? __mluop_min(i + 1, min_len) : loop_time - i; - s = i < s_len ? s_end - i : s_begin; - t = i < s_len ? t_end : t_end + s_len - i - 1; - - term1_num = i < t_len ? data_num - 1 : data_num; - if (term1_num > 0) { - __memcpy(nram_cur_term1, nram_term1 + s * (T + 1) + t, sizeof(float), - NRAM2NRAM, sizeof(float), T * sizeof(float), term1_num - 1); - nram_p_grad_for_compute_term1 = - i >= s_len ? nram_cur_p_grad + 1 : nram_cur_p_grad; - __bang_mul(nram_cur_term1, nram_cur_term1, nram_p_grad_for_compute_term1, - term1_num); - } - - term2_num = data_num; - nram_compute_term2 = nram_cur_term2; - term2_s = s; - term2_t = t; - if (i < s_len) { - term2_num -= 1; - nram_compute_term2 -= 1; - term2_s += 1; - term2_t -= 1; - } - if (term2_num > 0) { - __memcpy(nram_cur_term2, nram_term2 + term2_s * T + term2_t, - sizeof(float), NRAM2NRAM, sizeof(float), (T - 1) * sizeof(float), - term2_num - 1); - __bang_mul(nram_cur_term2, nram_cur_term2, nram_cur_p_grad, term2_num); - } - - __bang_add(nram_cur_p_grad, nram_cur_term1, nram_compute_term2, data_num); - __memcpy(nram_p_grad + s * (T + 1) + t, nram_cur_p_grad, sizeof(float), - NRAM2NRAM, T * sizeof(float), sizeof(float), data_num - 1); - } - - if (overwrite_ans_grad) { - __memcpy(ans_grad + b, nram_p_grad + s_begin * (T + 1) + t_begin, - sizeof(float), NRAM2GDRAM); - } -} - -__mlu_func__ void computePxGradAndPyGrad(const int b, const int S, const int T, - const int s_begin, const int s_end, - const int t_begin, const int t_end, - float *px_grad, float *py_grad) { - /* 
***********nram space split********** */ - /* | term1 | term2 | p_grad | */ - /* | S*(T+1) | (S+1)*T | (S+1)*(T+1) | */ - float *nram_term1 = (float *)nram_buffer; - float *nram_term2 = nram_term1 + S * (T + 1); - float *nram_p_grad = nram_term2 + (S + 1) * T; - - int t_len = t_end - t_begin + 1; - - for (int i = s_begin; i < s_end; ++i) { - // compute term1 - __bang_mul(nram_term1 + i * (T + 1) + t_begin, - nram_term1 + i * (T + 1) + t_begin, - nram_p_grad + (i + 1) * (T + 1) + t_begin, t_len); - - if (t_len > 1) { - // compute term2 - __bang_mul(nram_term2 + i * T + t_begin, nram_term2 + i * T + t_begin, - nram_p_grad + i * (T + 1) + t_begin + 1, t_len - 1); - } - } - - if (t_len > 1) { - // compute term2[s_end][:] - __bang_mul(nram_term2 + s_end * T + t_begin, - nram_term2 + s_end * T + t_begin, - nram_p_grad + s_end * (T + 1) + t_begin + 1, t_len - 1); - } - - if (S > 0) { - __memcpy(px_grad + b * S * (T + 1), nram_term1, S * (T + 1) * sizeof(float), - NRAM2GDRAM); - } - if (T > 0) { - __memcpy(py_grad + b * (S + 1) * T, nram_term2, (S + 1) * T * sizeof(float), - NRAM2GDRAM); - } -} - -__mlu_global__ void mluBlock3PipelineMutualInformationBackward( - const int B, const int S, const int T, const float *px, const float *py, - const bool has_boundary, const int64_t *opt_boundary, const float *p, - const bool overwrite_ans_grad, float *ans_grad, float *px_grad, - float *py_grad) { - const int num_per_core = B / taskDim; - const int num_rem = B % taskDim; - const int num_cur_core = num_per_core + (taskId < num_rem); - const int b_offset = taskId * num_cur_core + (taskId >= num_rem) * num_rem; - - int s_begin = 0; - int t_begin = 0; - int s_end = S; - int t_end = T; - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - __memcpy(boundary, opt_boundary + 4 * b, 4 * sizeof(int64_t), GDRAM2NRAM); - s_begin = boundary[0]; - t_begin = boundary[1]; - s_end = boundary[2]; - t_end = boundary[3]; - 
__bang_write_zero((float *)nram_buffer, S * (T + 1) + (S + 1) * T); - - if (s_begin > s_end || t_begin > t_end) { - if (S > 0) { - __memcpy(px_grad + b * S * (T + 1), (float *)nram_buffer, - S * (T + 1) * sizeof(float), NRAM2GDRAM); - } - if (T > 0) { - __memcpy(py_grad + b * (S + 1) * T, - (float *)nram_buffer + S * (T + 1), - (S + 1) * T * sizeof(float), NRAM2GDRAM); - } - continue; - } - computeTerm1AndTerm2(b, S, T, s_begin, s_end, t_begin, t_end, px, py, p); - computePGrad(b, S, T, s_begin, s_end, t_begin, t_end, overwrite_ans_grad, - ans_grad); - computePxGradAndPyGrad(b, S, T, s_begin, s_end, t_begin, t_end, px_grad, - py_grad); - } - } else { - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - computeTerm1AndTerm2(b, S, T, s_begin, s_end, t_begin, t_end, px, py, p); - computePGrad(b, S, T, s_begin, s_end, t_begin, t_end, overwrite_ans_grad, - ans_grad); - computePxGradAndPyGrad(b, S, T, s_begin, s_end, t_begin, t_end, px_grad, - py_grad); - } - } -} - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, - void *py_grad) { - KERNEL_CHECK( - mluBlock3PipelineMutualInformationBackward<<>>( - B, S, T, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, overwrite_ans_grad, - (float *)ans_grad, (float *)px_grad, (float *)py_grad)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_backward/mutual_information_backward_default_block.mlu b/kernels/mutual_information_backward/mutual_information_backward_default_block.mlu deleted file mode 100644 index 3f3a5e3f2..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward_default_block.mlu +++ /dev/null @@ -1,455 +0,0 @@ 
-/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "mutual_information_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_backward/mutual_information_backward_utils.h" - -__mlu_func__ bool calPartitionJobScope( - bool has_boundary, const int64_t *opt_boundary, const int B, const int S, - const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, int &batch_idx, int &batch_s_begin, - int &batch_t_begin, int &batch_s_end, int &batch_t_end, int &cur_s_begin, - int &cur_t_begin, int &cur_s_end, int &cur_t_end, int &cur_s_size, - int &cur_t_size, bool &need_compute_ans_grad, bool overwrite_ans_grad, - float *px_grad, float *py_grad) { - int job_num_on_batch = job_num_on_step / B; // Each batch job num - batch_idx = taskId / job_num_on_batch; // Current job on which batch - int block_id_in_batch = - taskId - batch_idx * job_num_on_batch; // Current job id in batch - - // taskDim is not always job num, because of TASK_DIM_X limit - if (batch_idx >= B) { - return true; - } - - // Compute s and t block id in batch - int s_block_id, t_block_id; - s_block_id = __mluop_max(0, s_block_num - 1 - step_i) + block_id_in_batch; - t_block_id = - __mluop_min(t_block_num - 1, s_block_num + t_block_num - 2 - step_i) - - block_id_in_batch; - - // Compute current job id scope - cur_s_begin = s_block_id * s_block_size; - cur_t_begin = t_block_id * t_block_size; - cur_s_end = (s_block_id + 1) * s_block_size - 1; - cur_t_end = (t_block_id + 1) * t_block_size - 1; - - // Deal with boundary and decide current job if need to compute - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - __memcpy(boundary, opt_boundary + 4 * batch_idx, 4 * sizeof(int64_t), - GDRAM2NRAM); - batch_s_begin = boundary[0]; - batch_t_begin = boundary[1]; - batch_s_end = boundary[2]; 
- batch_t_end = boundary[3]; - // invalid boundary, already use cnnlFill to set px_grad and py_grad to 0 - if (batch_s_begin > batch_s_end || batch_t_begin > batch_t_end) { - return true; - } - } - - // Compare current job scope with batch scope, if empty job, return - if (cur_s_begin > batch_s_end || cur_t_begin > batch_t_end || - cur_s_end < batch_s_begin || cur_t_end < batch_t_begin) { - return true; - } - - // Reset s and t begin and end to valid boundary - if (cur_s_begin < batch_s_begin) { - cur_s_begin = batch_s_begin; - } - if (cur_t_begin < batch_t_begin) { - cur_t_begin = batch_t_begin; - } - if (cur_s_end > batch_s_end) { - cur_s_end = batch_s_end; - } - if (cur_t_end > batch_t_end) { - cur_t_end = batch_t_end; - } - - cur_s_size = cur_s_end - cur_s_begin + 1; - cur_t_size = cur_t_end - cur_t_begin + 1; - - // At last compute step and overwrite, need to memcpy back to ans_grad - if (overwrite_ans_grad && cur_s_begin == batch_s_begin && - cur_t_begin == batch_t_begin) { - need_compute_ans_grad = true; - } else { - need_compute_ans_grad = false; - } - - return false; -} - -__mlu_func__ void loadInit(const float *gdram_px, const float *gdram_py, - const float *gdram_p, float *gdram_p_grad, - float *nram_px, float *nram_py, float *nram_p, - float *nram_p_grad, const int S, const int T, - const int batch_s_end, const int batch_t_end, - const int cur_s_begin, const int cur_t_begin, - const int cur_s_end, const int cur_t_end, - const int cur_s_size, const int cur_t_size) { - // Load p(s, t) - __memcpy_async(nram_p, gdram_p + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - (cur_t_size + 1) * sizeof(float), (T + 1) * sizeof(float), - cur_s_size - 1); - - // Compare current s_end and batch_s_end to decide: - // load px or write -inf, load p or write large_neg, load p_grad or write 0 - if (cur_s_end < batch_s_end) { - // Load px(s, t) - __memcpy_async(nram_px, gdram_px + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * 
sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), (T + 1) * sizeof(float), - cur_s_size - 1); - // Load p(s+1, t), one row - __memcpy_async(nram_p + cur_s_size * (cur_t_size + 1), - gdram_p + (cur_s_end + 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, 0, 0, 0); - // load p_grad(s+1, t), one row - __memcpy_async(nram_p_grad + cur_s_size * (cur_t_size + 1), - gdram_p_grad + (cur_s_end + 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, 0, 0, 0); - } else { // cur_s_end == batch_s_end, skip last row, write value - if (cur_s_size > 1) { - __memcpy_async(nram_px, gdram_px + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), (T + 1) * sizeof(float), - cur_s_size - 2); - } - // write -inf at px last row - __nramset_async(nram_px + (cur_s_size - 1) * cur_t_size, cur_t_size, - (float)(-INFINITY), 0, 0); - // write large_neg at p last row - __nramset_async(nram_p + cur_s_size * (cur_t_size + 1), cur_t_size, - (float)-1.0e+30, 0, 0); - // write 0 at p_grad last row - __nramset_async(nram_p_grad + cur_s_size * (cur_t_size + 1), cur_t_size, - (float)0.0, 0, 0); - } - - // Compare current t_end and batch_t_end to decide: - // load py or write -inf, load p or write large_neg, load p_grad or write 0 - if (cur_t_end < batch_t_end) { - // Load py(s, t) - __memcpy_async(nram_py, gdram_py + cur_s_begin * T + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - // Load p(s, t+1), one column - __memcpy_async(nram_p + cur_t_size, - gdram_p + cur_s_begin * (T + 1) + cur_t_end + 1, - sizeof(float), GDRAM2NRAM, (cur_t_size + 1) * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - // Load p_grad(s, t+1), one column - __memcpy_async(nram_p_grad + cur_t_size, - gdram_p_grad + cur_s_begin * (T + 1) + cur_t_end + 1, - sizeof(float), GDRAM2NRAM, (cur_t_size + 1) * sizeof(float), - (T + 1) * sizeof(float), 
cur_s_size - 1); - } else { // cur_t_end == batch_t_end, skip last column, write value - // Load py(s, t) - if (cur_t_size > 1) { - __memcpy_async(nram_py, gdram_py + cur_s_begin * T + cur_t_begin, - (cur_t_size - 1) * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - } - // write -inf at py last column - __nramset_async(nram_py + cur_t_size - 1, 1, (float)(-INFINITY), - cur_t_size * sizeof(float), cur_s_size - 1); - // write large_neg at p last column - __nramset_async(nram_p + cur_t_size, 1, (float)-1.0e+30, - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - // write 0 at p_grad last column - __nramset_async(nram_p_grad + cur_t_size, 1, (float)0.0, - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - } -} - -__mlu_func__ void computeByDiagonal( - float *nram_px, float *nram_py, float *nram_p, float *nram_p_grad, - float *nram_cur_px, float *nram_cur_py, float *nram_cur_p, - float *nram_next_p, float *nram_large_neg, float *nram_mask, - float *gdram_ans_grad, const int batch_s_end, const int batch_t_end, - const int cur_s_end, const int cur_t_end, const int cur_s_size, - const int cur_t_size) { - const int repeat = cur_s_size + cur_t_size - 1; - const int max_s_t = __mluop_max(cur_s_size, cur_t_size); - const int min_s_t = __mluop_min(cur_s_size, cur_t_size); - - for (int i = 0; i < repeat; ++i) { - int data_num = i < max_s_t ? 
__mluop_min(i + 1, min_s_t) - : cur_s_size + cur_t_size - i - 1; - - // px, py use same s, t index on nram, - int first_s = __mluop_max(0, cur_s_size - 1 - i); - int first_t = __mluop_min(cur_t_size - 1, cur_s_size + cur_t_size - 2 - i); - - // memcpy_async cur_px, cur_py, - // memcpy cur_p(same index, data_num), next_p(next index, data_num+1) - __memcpy(nram_cur_p, nram_p + first_s * (cur_t_size + 1) + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num - 1); - __memcpy(nram_next_p, nram_p + first_s * (cur_t_size + 1) + first_t + 1, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num); - __memcpy_async(nram_cur_px, nram_px + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - __memcpy_async(nram_cur_py, nram_py + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - - // make cur_p and next_p number < -1.0e+30 to -1.0e+30 - __bang_nan_maximum(nram_cur_p, nram_cur_p, nram_large_neg, data_num); - __bang_nan_maximum(nram_next_p, nram_next_p, nram_large_neg, data_num + 1); - - // sync for cur_px and cur_py - __sync(); - - // Compute term1 and term2, reuse cur_px, cur_py RAM - // cur_term1(s, t) = exp(cur_p(s, t) + cur_px(s, t) - next_p(s + 1, t)); - __bang_fusion(FUSION_FAS, nram_cur_px, nram_cur_px, nram_cur_p, - nram_next_p + 1, data_num, data_num); - // cur_term2(s, t) = exp(cur_p(s, t) + cur_py(s, t) - next_p(s, t + 1)); - __bang_fusion(FUSION_FAS, nram_cur_py, nram_cur_py, nram_cur_p, nram_next_p, - data_num, data_num); - - // sync for next_p - __sync(); - // memcpy_async next_p_grad to nram_next_p - __memcpy_async(nram_next_p, - nram_p_grad + first_s * (cur_t_size + 1) + first_t + 1, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num); - - // safeExp for term1 and term2 - safeExp(nram_cur_px, nram_cur_px, nram_mask, 
data_num); - safeExp(nram_cur_py, nram_cur_py, nram_mask, data_num); - - // sync for next_p_grad - __sync(); - - // Compute px_grad and py_grad - // cur_px_grad = cur_term1 * next_p_grad(s + 1, t) - __bang_mul(nram_cur_px, nram_cur_px, nram_next_p + 1, data_num); - // cur_py_grad = cur_term2 * next_p_grad(s, t + 1) - __bang_mul(nram_cur_py, nram_cur_py, nram_next_p, data_num); - - // sync for cur_px_grad and cur_py_grad - __sync(); - - // memcpy_async back to px_grad, py_grad - __memcpy_async(nram_px + first_s * cur_t_size + first_t, nram_cur_px, - sizeof(float), NRAM2NRAM, (cur_t_size - 1) * sizeof(float), - sizeof(float), data_num - 1); - __memcpy_async(nram_py + first_s * cur_t_size + first_t, nram_cur_py, - sizeof(float), NRAM2NRAM, (cur_t_size - 1) * sizeof(float), - sizeof(float), data_num - 1); - - // Compute p_grad - if (cur_s_end == batch_s_end && cur_t_end == batch_t_end && i == 0) { - // step 0, Initialize p_grad[s_end][t_end] = ans_grad[b] - __memcpy(nram_p_grad + first_s * (cur_t_size + 1) + first_t, - gdram_ans_grad, sizeof(float), GDRAM2NRAM); - } else { - // otherwise, need to compute cur_p_grad: - // cur_p_grad(cur_p) = cur_px_grad + cur_py_grad - __bang_add(nram_cur_p, nram_cur_px, nram_cur_py, data_num); - // memcpy back to p_grad - __memcpy(nram_p_grad + first_s * (cur_t_size + 1) + first_t, nram_cur_p, - sizeof(float), NRAM2NRAM, cur_t_size * sizeof(float), - sizeof(float), data_num - 1); - } - } -} - -__mlu_global__ void mluBlockDefaultMutualInformationBackward( - const int B, const int S, const int T, const int step_i, - const int job_num_on_step, const int s_block_num, const int t_block_num, - const int s_block_size, const int t_block_size, const float *px, - const float *py, const bool has_boundary, const int64_t *opt_boundary, - const float *p, const bool overwrite_ans_grad, float *ans_grad, - float *px_grad, float *py_grad, float *p_grad) { - /******************************** NRAM SPACE ******************************/ - /* Load Init */ - 
/*|---------------------------------------------------------------------|*/ - /*| px,py | p, p_grad |large_neg | | | |*/ - /*| 2*S*T |2*(S+1)*(T+1)| 2*min_len+1 | min_len | min_len | 2*min_len+1 |*/ - /*|---------------------------------------------------------------------|*/ - /* Compute term1 and term2 */ - /*|------------------------------------------------------------------|*/ - /*| px,py | p |large_neg,mask|cur_term1,2| cur_p | next_p |*/ - /*| 2*S*T |2*(S+1)*(T+1)| 2*min_len+1 | 2*min_len |min_len|min_len+1|*/ - /*|------------------------------------------------------------------|*/ - /* Compute px_grad, py_grad, p_grad */ - /*|------------------------------------------------------------------------|*/ - /*|px/y_grad| p_grad | | cur_term1,2 |cur_p_grad|next_p_grad|*/ - /*| | | |cur_px/y_grad| | |*/ - /*| 2*S*T |2*(S+1)*(T+1)|2*min_len+1| 2*min_len | min_len | min_len+1 |*/ - /*|------------------------------------------------------------------------|*/ - - // NOTE: s and t block size has already + 1 on S and T - int min_s_t_block_size = __mluop_min(s_block_size, t_block_size); - - // px, term1, px_grad - float *nram_px_buf = (float *)nram_buffer; - // py, term2, py_grad - float *nram_py_buf = nram_px_buf + s_block_size * t_block_size; - // p block - float *nram_p = nram_py_buf + s_block_size * t_block_size; - // p_grad block - float *nram_p_grad = nram_p + (s_block_size + 1) * (t_block_size + 1); - // Initialize with float(1.0e+30) value, to maximum with p - float *nram_large_neg = nram_p_grad + (s_block_size + 1) * (t_block_size + 1); - // mask - float *nram_mask = nram_large_neg + min_s_t_block_size + 1; - // cur_px, cur_term1, cur_px_grad - float *nram_cur_px_buf = nram_mask + min_s_t_block_size; - // cur_py, cur_term2, cur_py_grad - float *nram_cur_py_buf = nram_cur_px_buf + min_s_t_block_size; - // cur_p, cur_p_grad - float *nram_cur_p = nram_cur_py_buf + min_s_t_block_size; - // next_p, next_p_grad - float *nram_next_p = nram_cur_p + 
min_s_t_block_size; - - int batch_idx; - int batch_s_begin = 0; - int batch_t_begin = 0; - int batch_s_end = S; - int batch_t_end = T; - int cur_s_begin, cur_t_begin, cur_s_end, cur_t_end, cur_s_size, cur_t_size; - bool need_compute_ans_grad; - - // According to has_boundary, calculate current job scope - bool need_return = calPartitionJobScope( - has_boundary, opt_boundary, B, S, T, step_i, job_num_on_step, s_block_num, - t_block_num, s_block_size, t_block_size, batch_idx, batch_s_begin, - batch_t_begin, batch_s_end, batch_t_end, cur_s_begin, cur_t_begin, - cur_s_end, cur_t_end, cur_s_size, cur_t_size, need_compute_ans_grad, - overwrite_ans_grad, px_grad, py_grad); - // Because taskDimX could change to taskDimY, so not all jobs need to compute - if (need_return) { - return; - } - - // px_grad and px, py_grad and py, p_grad and p, have the same shape - const int px_one_batch_num = S * (T + 1); - const int py_one_batch_num = (S + 1) * T; - const int p_one_batch_num = (S + 1) * (T + 1); - - const float *gdram_px = px + batch_idx * px_one_batch_num; - const float *gdram_py = py + batch_idx * py_one_batch_num; - const float *gdram_p = p + batch_idx * p_one_batch_num; - - float *gdram_px_grad = px_grad + batch_idx * px_one_batch_num; - float *gdram_py_grad = py_grad + batch_idx * py_one_batch_num; - float *gdram_p_grad = p_grad + batch_idx * p_one_batch_num; - float *gdram_ans_grad = ans_grad + batch_idx; - - const int min_s_t = __mluop_min(cur_s_size, cur_t_size); - // loadInit: load px, py, other block p, - // or write -inf at last row of px, last column of py, - // write large_neg at last row and column of p, - // load other block p_grad, - // or write 0 at last row/column of p_grad - loadInit(gdram_px, gdram_py, gdram_p, gdram_p_grad, nram_px_buf, nram_py_buf, - nram_p, nram_p_grad, S, T, batch_s_end, batch_t_end, cur_s_begin, - cur_t_begin, cur_s_end, cur_t_end, cur_s_size, cur_t_size); - - // Initialize large_neg with value -1e+30 - __nramset_async(nram_large_neg, 
min_s_t + 1, (float)-1.0e+30, 0, 0); - // sync for initialization async instructions - __sync(); - - // Compute term1, term2, p_grad, px_grad, py_grad - computeByDiagonal(nram_px_buf, nram_py_buf, nram_p, nram_p_grad, - nram_cur_px_buf, nram_cur_py_buf, nram_cur_p, nram_next_p, - nram_large_neg, nram_mask, gdram_ans_grad, batch_s_end, - batch_t_end, cur_s_end, cur_t_end, cur_s_size, cur_t_size); - - // Store: - // memcpy back p_grad(workspace) - __memcpy(gdram_p_grad + cur_s_begin * (T + 1) + cur_t_begin, nram_p_grad, - cur_t_size * sizeof(float), NRAM2GDRAM, (T + 1) * sizeof(float), - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - // memcpy back px_grad - if (cur_s_end < batch_s_end) { - // memcpy all px_grad data back - __memcpy(gdram_px_grad + cur_s_begin * (T + 1) + cur_t_begin, nram_px_buf, - cur_t_size * sizeof(float), NRAM2GDRAM, (T + 1) * sizeof(float), - cur_t_size * sizeof(float), cur_s_size - 1); - } else { - // memcpy px_grad data except last row - if (cur_s_size > 1) { - __memcpy(gdram_px_grad + cur_s_begin * (T + 1) + cur_t_begin, nram_px_buf, - cur_t_size * sizeof(float), NRAM2GDRAM, (T + 1) * sizeof(float), - cur_t_size * sizeof(float), cur_s_size - 2); - } - } - // memcpy back py_grad - if (cur_t_end < batch_t_end) { - // memcpy all py_grad data back - __memcpy(gdram_py_grad + cur_s_begin * T + cur_t_begin, nram_py_buf, - cur_t_size * sizeof(float), NRAM2GDRAM, T * sizeof(float), - cur_t_size * sizeof(float), cur_s_size - 1); - } else { - // memcpy py_grad data except last column - if (cur_t_size > 1) { - __memcpy(gdram_py_grad + cur_s_begin * T + cur_t_begin, nram_py_buf, - (cur_t_size - 1) * sizeof(float), NRAM2GDRAM, T * sizeof(float), - cur_t_size * sizeof(float), cur_s_size - 1); - } - } - - // If last compute step, need store p_grad[s_begin, t_begin] to ans_grad - if (need_compute_ans_grad) { - ans_grad[batch_idx] = nram_p_grad[0]; - } -} - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationBackward( - cnrtDim3_t k_dim, 
cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, void *py_grad, - void *p_grad) { - KERNEL_CHECK( - mluBlockDefaultMutualInformationBackward<<>>( - B, S, T, step_i, job_num_on_step, s_block_num, t_block_num, - s_block_size, t_block_size, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, overwrite_ans_grad, - (float *)ans_grad, (float *)px_grad, (float *)py_grad, - (float *)p_grad)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_backward/mutual_information_backward_utils.h b/kernels/mutual_information_backward/mutual_information_backward_utils.h deleted file mode 100644 index 19bbe306c..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_UTILS_H_ -#define KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_UTILS_H_ - -#include "mlu_op.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void setNanInfToZero(float *src, float *mask, const int num) { - // band with 0x7F800000, exp bits are not all 1, mask -> 0xffffffff - __asm__ volatile( - "fuse.nram.s32 [%[dst]], %[size], [%[src0]]," - ".and(%[src1]), .ne(%[src2]), .mul(%[src3]);\n" ::[dst] "r"( - (int32_t *)mask), - [ size ] "r"(num), [ src0 ] "r"((int32_t *)src), [ src1 ] "r"(0x7f800000), - [ src2 ] "r"(0x7f800000), [ src3 ] "r"(-1)); - __bang_band((char *)src, (char *)src, (char *)mask, num * sizeof(float)); -} - -__mlu_func__ void safeExp(float *dst, float *src, float *mask, const int num) { - setNanInfToZero(src, mask, num); - __mluop_exp(dst, src, NULL, 0, num); - // erase exp(0) to 0 with mask - __bang_band((char *)dst, (char *)dst, (char *)mask, num * sizeof(float)); - setNanInfToZero(dst, mask, num); -} - -#endif // KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_UTILS_H_ // NOLINT diff --git a/kernels/mutual_information_forward/mutual_information_forward.cpp b/kernels/mutual_information_forward/mutual_information_forward.cpp deleted file mode 100644 index 8b50f6164..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward.cpp +++ /dev/null @@ -1,741 
+0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "mutual_information_forward.h" - -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" - -#define API_NAME "[mluOpMutualInformationForward]" - -mluOpStatus_t MLUOP_WIN_API mluOpGetMutualInformationForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc, size_t *workspace_size) { - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_desc != nullptr); - PARAM_CHECK(API_NAME, workspace_size != nullptr); - // Workspace is not required in the current implementation. - *workspace_size = 0; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDim( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc) { - if (3 != px_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of px must be 3. " - << "But now the dim of px is " << px_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != py_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of py must be 3. " - << "But now the dim of py is " << py_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (nullptr != opt_boundary_desc && 2 != opt_boundary_desc->dim) { - LOG(ERROR) << API_NAME - << " The dim of opt_boundary must be 2 when opt_boundary is " - << "not NULL. 
But now the dim of opt_boundary is " - << opt_boundary_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != p_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of p must be 3. " - << "But now the dim of p is " << p_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (1 != ans_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of ans must be 1. " - << "But now the dim of ans is " << ans_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorShape( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - if (B != py_desc->dims[0] || B != p_desc->dims[0] || B != ans_desc->dims[0]) { - LOG(ERROR) << API_NAME - << " px.shape[0], py.shape[0], p.shape[0], ans.shape[0], " - << "must be same. But now " - << "px.shape[0] is " << px_desc->dims[0] << ", py.shape[0] is " - << py_desc->dims[0] << ", p.shape[0] is " << p_desc->dims[0] - << ", ans.shape[0] is " << ans_desc->dims[0] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // Currently only supports !modified, so the shape of px must be [B, S, T+1] - if (T + 1 != px_desc->dims[2]) { - LOG(ERROR) << API_NAME << " Currently only supports the case that " - << "px.shape[2] must be equal to py.shape[2] + 1. But now " - << "px.shape[2] is " << px_desc->dims[2] << ", py.shape[2] is " - << py_desc->dims[2] << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // The shape of py must be [B, S+1, T] - if (S + 1 != py_desc->dims[1]) { - LOG(ERROR) << API_NAME << " py.shape[1] must be equal to px.shape[1] + 1. 
" - << "But now px.shape[1] is " << px_desc->dims[1] - << ", py.shape[1] is " << py_desc->dims[1] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of opt_boundary must be [B, 4] - if (nullptr != opt_boundary_desc && - (B != opt_boundary_desc->dims[0] || 4 != opt_boundary_desc->dims[1])) { - LOG(ERROR) << API_NAME << " When opt_boundary is not NULL, " - << "opt_boundary.shape[0] and px.shape[0] must be same, and " - << "opt_boundary.shape[1] must be 4. But now " - << "px.shape[0] is " << px_desc->dims[0] - << ", opt_boundary.shape[0] is " << opt_boundary_desc->dims[0] - << ", opt_boundary.shape[1] is " << opt_boundary_desc->dims[1] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of p must be [B, S+1, T+1] - if (S + 1 != p_desc->dims[1] || T + 1 != p_desc->dims[2]) { - LOG(ERROR) << API_NAME << " p.shape[1] and py.shape[1] must be same, and " - << "p.shape[2] must be equal to py.shape[2] + 1. " - << "But now p.shape[1] is " << p_desc->dims[1] - << ", py.shape[1] is " << py_desc->dims[1] << ", p.shape[2] is " - << p_desc->dims[2] << ", py.shape[2] is " << py_desc->dims[2] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDatatype( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc) { - if (MLUOP_DTYPE_FLOAT != px_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of px currently only support float. But now " - << "the data type of px is " - << mluOpGetNameOfDataType(px_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != py_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of py currently only support float. 
But now " - << "the data type of py is " - << mluOpGetNameOfDataType(py_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (nullptr != opt_boundary_desc && - MLUOP_DTYPE_INT64 != opt_boundary_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of opt_boundary currently only support int64." - << " But now the data type of opt_boundary is " - << mluOpGetNameOfDataType(opt_boundary_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != p_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of p currently only support float. But now " - << "the data type of p is " - << mluOpGetNameOfDataType(p_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != ans_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of ans currently only support float. " - << "But now the data type of ans is " - << mluOpGetNameOfDataType(ans_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorScaleLimit( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc) { - // check large tensor - if (mluOpGetTensorElementNum(px_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(py_desc) >= LARGE_TENSOR_NUM || - (nullptr != opt_boundary_desc && - mluOpGetTensorElementNum(opt_boundary_desc) >= LARGE_TENSOR_NUM) || - mluOpGetTensorElementNum(p_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << API_NAME << " Overflow max tensor num." 
- << " Current operator supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorPtr( - const void *px, const void *py, const void *p, const void *ans, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const int S, const int T, bool &has_boundary) { - if (S > 0) { - PARAM_CHECK(API_NAME, px != nullptr); - } else { - VLOG(5) << API_NAME << " px.shape[1] is zero."; - } - - if (T > 0) { - PARAM_CHECK(API_NAME, py != nullptr); - } else { - VLOG(5) << API_NAME << " py.shape[2] is zero."; - } - - PARAM_CHECK(API_NAME, p != nullptr); - PARAM_CHECK(API_NAME, ans != nullptr); - - if (nullptr != opt_boundary_desc && nullptr != opt_boundary) { - has_boundary = true; - VLOG(5) << API_NAME << " opt_boundary is not NULL."; - } else if (nullptr == opt_boundary_desc && nullptr == opt_boundary) { - has_boundary = false; - VLOG(5) << API_NAME << " opt_boundary is NULL."; - } else { - LOG(ERROR) << API_NAME - << " opt_boundary_desc and opt_boundary must both be NULL, " - << "or both not be NULL."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t mutualInformationForwardParamCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t ans_desc, - void *ans, bool &has_boundary, bool &zero_element) { - // 1. 
check handle and tensor_desc - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_desc != nullptr); - - // since the layout of all tensor is ARRAY, so skip check tensor layout - - // 2. check mlu platform - if (handle->arch < 372) { - LOG(ERROR) << API_NAME << " Only mlu300 and above devices are supported." - << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // 3. check tensor dim - mluOpStatus_t check_status = - checkTensorDim(px_desc, py_desc, opt_boundary_desc, p_desc, ans_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 4. check tensor shape - check_status = - checkTensorShape(px_desc, py_desc, opt_boundary_desc, p_desc, ans_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 5. check tensor dtype - check_status = checkTensorDatatype(px_desc, py_desc, opt_boundary_desc, - p_desc, ans_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 6. check scale limit, for large tensor - check_status = checkTensorScaleLimit(handle, px_desc, py_desc, - opt_boundary_desc, p_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - // 7. check zero element. - if (0 == B) { - zero_element = true; - VLOG(5) << API_NAME - << " Skip zero element tensor when px.shape[0] is zero."; - return MLUOP_STATUS_SUCCESS; - } - - // 8 check workspace - if (workspace_size > 0) { - PARAM_CHECK(API_NAME, workspace != nullptr); - } - - // 9. 
check tensor ptr - check_status = checkTensorPtr(px, py, p, ans, opt_boundary_desc, opt_boundary, - S, T, has_boundary); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - return MLUOP_STATUS_SUCCESS; -} - -static void mutualInformationForwardGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_desc, void *ans) { - GEN_CASE_START("mutual_information_forward"); - GEN_CASE_HANDLE(handle); - - GEN_CASE_DATA(true, "px", px, px_desc, -1, 1); - GEN_CASE_DATA(true, "py", py, py_desc, -1, 1); - if (nullptr != opt_boundary) { - GEN_CASE_DATA_REAL(true, "opt_boundary", opt_boundary, opt_boundary_desc); - } - GEN_CASE_DATA(true, "p", p, p_desc, -1, 1); - GEN_CASE_DATA(false, "p", p, p_desc, -1, 1); - GEN_CASE_DATA(false, "ans", ans, ans_desc, -1, 1); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -static void policyFunc3Pipeline(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, int batch_size) { - int core_num = mluop::runtime::getClusterLimitCapability(handle) * - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - *k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim->x = 1; - k_dim->y = batch_size < core_num ? 
batch_size : core_num; - k_dim->z = 1; -} - -static mluOpStatus_t launchMutualInformationForward3PipelineKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc3Pipeline(handle, &k_dim, &k_type, B); - VLOG(5) << "Launch Kernel 3PipelineMutualInformationForward<<>>"; - CHECK_RETURN("[MutualInformationForward]", - kernel3PipelineMutualInformationForward( - k_dim, k_type, handle->queue, B, S, T, px, py, has_boundary, - opt_boundary, p, ans)); - - return MLUOP_STATUS_SUCCESS; -} - -// Calculate computing diagonal number of partition mode for default kernel -static void calComputingDiags(const int S, const int T, - int64_t *computing_diag_num, int *s_block_size, - int *t_block_size, int *s_repeat, int *t_repeat, - int *s_remainder, int *t_remainder, - const int mode) { - // If has remainder part, rearrange block size to balance work load - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - if (s_remainder[mode] > 0) { - s_block_size[mode] = S / (s_repeat[mode] + 1); - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - } - - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - if (t_remainder[mode] > 0) { - t_block_size[mode] = T / (t_repeat[mode] + 1); - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - } - - // Accumulate all block's computing diagonal numbers - computing_diag_num[mode] = s_repeat[mode] * t_repeat[mode] * - (s_block_size[mode] + t_block_size[mode] - 1); - if (s_remainder[mode] > 0) { - computing_diag_num[mode] += - t_repeat[mode] * (t_block_size[mode] + s_remainder[mode] - 1); - } 
- - if (t_remainder[mode] > 0) { - computing_diag_num[mode] += - s_repeat[mode] * (s_block_size[mode] + t_remainder[mode] - 1); - } - - if (s_remainder[mode] > 0 && t_remainder[mode] > 0) { - computing_diag_num[mode] += s_remainder[mode] + t_remainder[mode] - 1; - } -} - -static void assignPartitionParams(const int *s_block_size, - const int *t_block_size, const int *s_repeat, - const int *t_repeat, const int *s_remainder, - const int *t_remainder, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder, const int mode) { - final_s_block_size = s_block_size[mode]; - final_t_block_size = t_block_size[mode]; - final_s_repeat = s_repeat[mode]; - final_t_repeat = t_repeat[mode]; - final_s_remainder = s_remainder[mode]; - final_t_remainder = t_remainder[mode]; -} - -static void calDefaultPartition(const int S, const int T, const int N_size, - const int nram_size, int &job_diag_num, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder) { - // Compute each partition's job diagonal number, - // and choose the partition method with the least job diagonal number: - // 1) all S and T, no partition, launch once in one batch; - // 2) S < max_N_size, compare with (S, t) and (S/2, t); - // 3) T < max_N_size, compare with (s, T) and (s, T/2); - // 4) both S and T > max_N_size, compare with (N, N), (S, t), (s, T), if - // exist; - if (S <= N_size && T <= N_size) { - // once can compute all SxT onchip - job_diag_num = 1; - final_s_block_size = S; - final_t_block_size = T; - final_s_repeat = 1; - final_t_repeat = 1; - final_s_remainder = 0; - final_t_remainder = 0; - return; - } else { - // Sum of each partition's number of computing diagonals - // at most 3 arrays of candidate partition mode - int mode; - int64_t computing_diag_num[3] = {0}; - int s_block_size[3] = {0}; - int t_block_size[3] = {0}; - 
int s_repeat[3] = {0}; - int t_repeat[3] = {0}; - int s_remainder[3] = {0}; - int t_remainder[3] = {0}; - - if (S <= N_size && T > N_size) { - // compare with (S, t) and (S/2, t) - // 1) deal_s = S; min(s, t) = s; - mode = 0; - s_block_size[0] = S; - t_block_size[0] = (nram_size / sizeof(float) - 7 * s_block_size[0]) / - (3 * s_block_size[0] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S/2; min(s, t) = s; - mode = 1; - s_block_size[1] = std::max(S / 2, 1); // at least 1 number in s_block - t_block_size[1] = (nram_size / sizeof(float) - 7 * s_block_size[1]) / - (3 * s_block_size[1] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else if (S > N_size && T <= N_size) { - // compare with (s, T) and (s, T/2) - // 1) deal_t = T; min(s, t) = t; - mode = 0; - t_block_size[0] = T; - s_block_size[0] = (nram_size / sizeof(float) - 7 * t_block_size[0]) / - (3 * t_block_size[0] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_t = T/2; min(s, t) = t; - mode = 1; - t_block_size[1] = std::max(T / 2, 1); // at least 1 number in t_block - s_block_size[1] = (nram_size / sizeof(float) - 7 * t_block_size[1]) / - (3 * t_block_size[1] + 1); - calComputingDiags(S, T, computing_diag_num, 
s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else { // S > N_size, T > N_size, choose between (N,N), (S,t), (s,T) - // 1) deal_s = deal_t = N_size; min(s,t) = s = t; - mode = 0; - s_block_size[0] = N_size; - t_block_size[0] = N_size; - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S, deal_t = t; min(s,t) = t; - mode = 1; - s_block_size[1] = N_size; - t_block_size[1] = (nram_size / sizeof(float) - 1 * s_block_size[1]) / - (3 * s_block_size[1] + 7); - if (t_block_size[1] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[1] = -1; // not support on this partition - } - // 3) deal_t = T, deal_s = s; min(s,t) = s; - mode = 2; - t_block_size[2] = T; - s_block_size[2] = (nram_size / sizeof(float) - 1 * t_block_size[2]) / - (3 * t_block_size[2] + 7); - if (s_block_size[2] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[2] = -1; // not support on this partition - } - - if (computing_diag_num[0] > 0 && // mode 0 is valid - ((computing_diag_num[1] <= 0) || // mode 1 is invalid or - computing_diag_num[0] <= - computing_diag_num[1])) { // mode 0 is better than mode 1 - if (computing_diag_num[2] > 0 && // 
mode 2 is valid and - computing_diag_num[2] < - computing_diag_num[0]) { // mode 2 is better than mode 0 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 0 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 0); - } - } else { // mode 1 is valid and mode 1 is better than mode 0 - if (computing_diag_num[2] > 0 && // mode 2 is valid - computing_diag_num[2] < - computing_diag_num[1]) { // mode 2 is better than mode 1 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 1 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 1); - } - } - } - // total job diagonal number in parallel - job_diag_num = final_s_repeat + (int)(final_s_remainder > 0) + - final_t_repeat + (int)(final_t_remainder > 0) - 1; - } -} - -static mluOpStatus_t launchMutualInformationForwardDefaultKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - // When S and T is too large, launch default kernel with partition of S and T - // 1. Compute current arch max N size, according to NRAM size and device RAM - // 2. 
Use max_N_size to calculate different partition mode computing diagonal - // numbers and choose the partition mode, which has the least computing - // diagonal number - // 3. Launch default kernels by diagonal in parallel, with check of MaxDimX - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - // 1. According to on-chip RAM size, calculate current arch partition block - // size by square, Use max_N_size to partition on S and T dimension RAM space: - // (S+1)*(T+1) + S*T + S*T + 3*min(S,T) + 3*min(S,T)+1 - int max_N_size = (int)(std::sqrt(handle->nram_size / sizeof(float) / 3)) - 2; - // Use max square size N, partition on T and S dimension, launch by diagonal: - // -|------T--------| - // :| N1| N2| N3| N4| - // :|---|---|---|---| - // S| N2| N3| N4| N5| - // :|---|---|---|---| - // :| N3| N4| N5| N6| - // -|---------------| - - VLOG(5) << "Current arch Max square N size is " << max_N_size; - - int job_diag_num; // number of default kernel launch steps by diagonal - int s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, t_remainder; - - // 2. 
Choose the partition mode, which has the least computing diagonal number - // NOTE: p has dimension (S+1, T+1), in function directly use (S, T) instead - calDefaultPartition(S + 1, T + 1, max_N_size, handle->nram_size, job_diag_num, - s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder); - int s_block_num = s_repeat + (int)(s_remainder > 0); - int t_block_num = t_repeat + (int)(t_remainder > 0); - int max_s_t_block_num = std::max(s_block_num, t_block_num); - int min_s_t_block_num = std::min(s_block_num, t_block_num); - - k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim.y = 1; - k_dim.z = 1; - // Get current arch support max dim_x value - int task_dim_x_limit; - cnDeviceGetAttribute(&task_dim_x_limit, - CN_DEVICE_ATTRIBUTE_MAX_BLOCK_TASK_DIM_X, - handle->device); - VLOG(5) << "Current arch MAX_BLOCK_TASK_DIM_X is " << task_dim_x_limit; - - // 3. Traverse step_i from 0 to (job_diag_num - 1) - for (int step_i = 0; step_i < job_diag_num; step_i++) { - int job_num_on_step = B * (step_i < max_s_t_block_num - ? 
std::min(step_i + 1, min_s_t_block_num) - : s_block_num + t_block_num - step_i - 1); - k_dim.x = job_num_on_step; - // Make sure not exceed max dim x limit - if (k_dim.x > task_dim_x_limit) { - int task_dim_change = (k_dim.x + task_dim_x_limit - 1) / task_dim_x_limit; - k_dim.x = (k_dim.x + task_dim_x_limit - 1) / task_dim_change; - k_dim.y = k_dim.y * task_dim_change; - } - - VLOG(5) << "Launch Kernel DefaultMutualInformationForward<<< step " - << step_i << " of Batch Block: " << k_dim.x << ", " << k_dim.y - << ", " << k_dim.z << ">>>"; - CHECK_RETURN("[MutualInformationForward]", - kernelDefaultMutualInformationForward( - k_dim, k_type, handle->queue, B, S, T, step_i, - job_num_on_step, s_block_num, t_block_num, s_block_size, - t_block_size, px, py, has_boundary, opt_boundary, p, ans)); - } - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMutualInformationForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, void *p, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t ans_desc, - void *ans) { - // 1. Paramcheck - bool has_boundary = false; - bool zero_element = false; - mluOpStatus_t check_status = mutualInformationForwardParamCheck( - handle, px_desc, px, py_desc, py, opt_boundary_desc, opt_boundary, p_desc, - p, workspace, workspace_size, ans_desc, ans, has_boundary, zero_element); - - if (MLUOP_STATUS_SUCCESS != check_status || zero_element) { - return check_status; - } - - // 2. 
Generate case - if (MLUOP_GEN_CASE_ON_NEW) { - mutualInformationForwardGencase(handle, px_desc, px, py_desc, py, - opt_boundary_desc, opt_boundary, p_desc, p, - ans_desc, ans); - } - - // Choose to launch 3pipeline kernel or default kernel - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - bool is_launch_3pipeline = true; - - // Check 3pipeline kernel scale limit for computing p - // 9: max_val, mask, temp, ping(py, px, p) and pong(py, px, p) - // 11: max_val, mask, temp, ping(py, px, p), pong(py, px, p) and 2*(-inf) - int current_size = - T * (S + 1) + (T + 1) * S + (T + 1) * (S + 1) + 9 * std::min(S, T) + 11; - if (current_size > handle->nram_size / sizeof(float)) { - is_launch_3pipeline = false; - } - - // 3. Launch kernel - mluOpStatus_t return_status; - if (is_launch_3pipeline) { - // launch 3pipeline kernel when satisfy scale limit - return_status = launchMutualInformationForward3PipelineKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, ans); - } else { - // launch default kernel - return_status = launchMutualInformationForwardDefaultKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, ans); - } - - GEN_CASE_END(); - return return_status; -} diff --git a/kernels/mutual_information_forward/mutual_information_forward.h b/kernels/mutual_information_forward/mutual_information_forward.h deleted file mode 100644 index 42df0dc9b..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward.h +++ /dev/null @@ -1,41 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_H_ -#define KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_H_ - -#include "mlu_op.h" -#include "kernels/kernel.h" - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans); - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans); - -#endif // KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_H_ diff --git a/kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu b/kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu deleted file mode 100644 index f3075cb32..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu +++ /dev/null @@ -1,227 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "mutual_information_forward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_forward/mutual_information_forward_utils.h" - -__mlu_func__ void pipelineLoad(float *nram_px, float *nram_py, const int T, - const int s_len, const int t_len, - const int max_len, const int min_len, - const int s_begin, const int t_begin, - const int t_end, const int diagonal_num, - const int i, float *ping) { - int data_num = i < max_len ? __mluop_min(i + 1, min_len) : diagonal_num - i; - int s = i - 1 < t_len ? s_begin : i - t_len + s_begin; - int t = i - 1 < t_len ? i + t_begin - 1 : t_end; - int px_num = i < t_len ? 
data_num - 1 : data_num; - if (px_num > 0) { - __memcpy_async(ping + min_len + 1, nram_px + s * (T + 1) + t, sizeof(float), - NRAM2NRAM, sizeof(float), T * sizeof(float), px_num - 1); - } - - int py_num = i < s_len ? data_num - 1 : data_num; - if (i >= t_len) { - s += 1; - t -= 1; - } - - if (py_num > 0) { - __memcpy_async(ping, nram_py + s * T + t, sizeof(float), NRAM2NRAM, - sizeof(float), (T - 1) * sizeof(float), py_num - 1); - } -} - -__mlu_func__ void pipelineCompute(const int s_len, const int t_len, - const int max_len, const int min_len, - const int diagonal_num, const int i, - float *max_value, float *mask, float *temp, - float *pong_p, float *ping) { - float *ping_py = ping; - float *ping_px = ping_py + min_len + 1; - float *ping_p = ping_px + min_len; - - int data_num = i < max_len ? __mluop_min(i + 1, min_len) : diagonal_num - i; - - int px_num = i < t_len ? data_num - 1 : data_num; - if (px_num > 0) { - __bang_add(ping_px, ping_px, pong_p, px_num); - } - - int py_num = i < s_len ? data_num - 1 : data_num; - if (i >= t_len) { - pong_p += 1; - } - if (py_num > 0) { - __bang_add(ping_py, ping_py, pong_p, py_num); - } - - ping_px = i < t_len ? ping_px - 1 : ping_px; - logAddVector(ping_p, ping_px, ping_py, max_value, mask, temp, data_num); - - __bang_write_value(ping, 2 * min_len + 1, -INFINITY); -} - -__mlu_func__ void pipelineStore(float *nram_p, const int T, const int t_len, - const int max_len, const int min_len, - const int s_begin, const int t_begin, - const int t_end, const int diagonal_num, - const int i, float *ping_p) { - int data_num = i < max_len ? __mluop_min(i + 1, min_len) : diagonal_num - i; - int s = i < t_len ? s_begin : i - t_len + 1 + s_begin; - int t = i < t_len ? 
i + t_begin : t_end; - __memcpy_async(nram_p + s * (T + 1) + t, ping_p, sizeof(float), NRAM2NRAM, - T * sizeof(float), sizeof(float), data_num - 1); -} - -__mlu_func__ void compute3PipelineMutualInformation( - const int b, const int S, const int T, const bool has_boundary, - const int s_begin, const int s_end, const int t_begin, const int t_end, - const float *px, const float *py, float *p, float *ans) { - /* *********************nram space split********************** */ - /* |--------------------------COMMON-------------------------| */ - /* | px | py | p | max_val | mask | temp | */ - /* |S*(T+1)|(S+1)*T|(S+1)*(T+1)| min_len | min_len | min_len | */ - /* |------------PING------------|------------PONG------------| */ - /* | cur_py|-inf|cur_px | cur_p | cur_py|-inf|cur_px | cur_p | */ - /* |min_len| 1 |min_len|min_len|min_len| 1 |min_len|min_len| */ - const int px_one_batch_size = S * (T + 1); - const int py_one_batch_size = (S + 1) * T; - const int p_one_batch_size = (S + 1) * (T + 1); - - float *nram_px = (float *)nram_buffer; - float *nram_py = nram_px + px_one_batch_size; - float *nram_p = nram_py + py_one_batch_size; - - if (S > 0) { - __memcpy(nram_px, px + b * px_one_batch_size, - px_one_batch_size * sizeof(float), GDRAM2NRAM); - } - - if (T > 0) { - __memcpy(nram_py, py + b * py_one_batch_size, - py_one_batch_size * sizeof(float), GDRAM2NRAM); - } - - if (has_boundary) { - __memcpy(nram_p, p + b * p_one_batch_size, p_one_batch_size * sizeof(float), - GDRAM2NRAM); - } - - const int s_len = s_end - s_begin + 1; - const int t_len = t_end - t_begin + 1; - const int max_len = __mluop_max(s_len, t_len); - const int min_len = __mluop_min(s_len, t_len); - const int ping_pong_gap = 3 * min_len + 1; - - float *nram_max_value = nram_p + p_one_batch_size; - float *nram_mask = nram_max_value + min_len; - float *nram_temp = nram_mask + min_len; - - float *ping = nram_temp + min_len; - float *ping_p = ping + 2 * min_len + 1; - - __bang_write_value(ping, ping_pong_gap * 2, 
-INFINITY); - - nram_p[s_begin * (T + 1) + t_begin] = (float)0; - ping_p[ping_pong_gap] = (float)0; - - __sync(); - - int repeat = s_len + t_len - 2; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - pipelineLoad(nram_px, nram_py, T, s_len, t_len, max_len, min_len, s_begin, - t_begin, t_end, repeat + 1, i + 1, - ping + (i % 2) * ping_pong_gap); - } - - if (i > 0 && i <= repeat) { - pipelineCompute(s_len, t_len, max_len, min_len, repeat + 1, i, - nram_max_value, nram_mask, nram_temp, - ping_p + (i % 2) * ping_pong_gap, - ping + ((i - 1) % 2) * ping_pong_gap); - } - - if (i > 1) { - pipelineStore(nram_p, T, t_len, max_len, min_len, s_begin, t_begin, t_end, - repeat + 1, i - 1, ping_p + (i % 2) * ping_pong_gap); - } - __sync(); - } - - __memcpy(ans + b, nram_p + s_end * (T + 1) + t_end, sizeof(float), - NRAM2GDRAM); - __memcpy(p + b * p_one_batch_size, nram_p, p_one_batch_size * sizeof(float), - NRAM2GDRAM); -} - -__mlu_global__ void mluBlock3PipelineMutualInformationForward( - const int B, const int S, const int T, const float *px, const float *py, - const bool has_boundary, const int64_t *opt_boundary, float *p, - float *ans) { - const int num_per_core = B / taskDim; - const int num_rem = B % taskDim; - const int num_cur_core = num_per_core + (taskId < num_rem); - const int b_offset = taskId * num_cur_core + (taskId >= num_rem) * num_rem; - - int s_begin = 0; - int t_begin = 0; - int s_end = S; - int t_end = T; - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - __memcpy(boundary, opt_boundary + 4 * b, 4 * sizeof(int64_t), GDRAM2NRAM); - s_begin = boundary[0]; - t_begin = boundary[1]; - s_end = boundary[2]; - t_end = boundary[3]; - - if (s_begin > s_end || t_begin > t_end) { - ans[b] = 0.0; - continue; - } - compute3PipelineMutualInformation(b, S, T, has_boundary, s_begin, s_end, - t_begin, t_end, px, py, p, ans); - } - } else { - for (int b = b_offset; b < b_offset + 
num_cur_core; ++b) { - compute3PipelineMutualInformation(b, S, T, has_boundary, s_begin, s_end, - t_begin, t_end, px, py, p, ans); - } - } -} - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - KERNEL_CHECK( - mluBlock3PipelineMutualInformationForward<<>>( - B, S, T, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, (float *)ans)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_forward/mutual_information_forward_default_block.mlu b/kernels/mutual_information_forward/mutual_information_forward_default_block.mlu deleted file mode 100644 index f66fc93c8..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward_default_block.mlu +++ /dev/null @@ -1,307 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "mutual_information_forward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_forward/mutual_information_forward_utils.h" - -__mlu_func__ bool calPartitionJobScope( - bool has_boundary, const int64_t *opt_boundary, const int B, const int S, - const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, int &batch_idx, int &batch_s_begin, - int &batch_t_begin, int &batch_s_end, int &batch_t_end, int &cur_s_begin, - int &cur_t_begin, int &cur_s_end, int &cur_t_end, int &cur_s_size, - int &cur_t_size, bool &need_compute_ans, float *ans) { - int job_num_on_batch = job_num_on_step / B; // Each batch job num - batch_idx = taskId / job_num_on_batch; // Current job on which batch - int block_id_in_batch = - taskId - batch_idx * job_num_on_batch; // Current job id in batch - - // taskDim is not always job num, because of task dim x limit - if (batch_idx >= B) { - return true; - } - - // Compute s and t block id in batch - int s_block_id, t_block_id; - s_block_id = __mluop_max(0, step_i - (t_block_num - 1)) + block_id_in_batch; - t_block_id = __mluop_min(step_i, t_block_num - 1) - block_id_in_batch; - - // Compute current job id scope - cur_s_begin = s_block_id * s_block_size; - cur_t_begin = t_block_id * t_block_size; - cur_s_end = (s_block_id + 1) * s_block_size - 1; - cur_t_end = (t_block_id + 1) * t_block_size - 1; - - // Deal with boundary and decide current job if need to compute - if (has_boundary) { - int64_t *boundary = (int64_t 
*)nram_buffer; - __memcpy(boundary, opt_boundary + 4 * batch_idx, 4 * sizeof(int64_t), - GDRAM2NRAM); - batch_s_begin = boundary[0]; - batch_t_begin = boundary[1]; - batch_s_end = boundary[2]; - batch_t_end = boundary[3]; - // invalid boundary, first launch step set ans to 0 - if (step_i == 0 && - (batch_s_begin > batch_s_end || batch_t_begin > batch_t_end)) { - ans[batch_idx] = 0; - return true; - } - } - - // Compare current job scope with batch scope, if empty job, return - if (cur_s_begin > batch_s_end || cur_t_begin > batch_t_end || - cur_s_end < batch_s_begin || cur_t_end < batch_t_begin) { - return true; - } - - // Reset s and t begin and end to valid boundary - if (cur_s_begin < batch_s_begin) { - cur_s_begin = batch_s_begin; - } - if (cur_t_begin < batch_t_begin) { - cur_t_begin = batch_t_begin; - } - if (cur_s_end > batch_s_end) { - cur_s_end = batch_s_end; - } - if (cur_t_end > batch_t_end) { - cur_t_end = batch_t_end; - } - - cur_s_size = cur_s_end - cur_s_begin + 1; - cur_t_size = cur_t_end - cur_t_begin + 1; - - // At last compute step, need compute ans - if (cur_s_end == batch_s_end && cur_t_end == batch_t_end) { - need_compute_ans = true; - } else { - need_compute_ans = false; - } - - return false; -} - -__mlu_func__ void loadInitP(const float *gdram_px, const float *gdram_py, - const float *gdram_p, float *nram_px, - float *nram_py, float *nram_p, const int S, - const int T, const int batch_s_begin, - const int batch_t_begin, const int cur_s_begin, - const int cur_t_begin, const int cur_s_size, - const int cur_t_size) { - // Compare current s_begin and batch_s_begin to decide load px or write -inf - if (cur_s_begin > batch_s_begin) { - // Load px(s-1, t) - __memcpy_async( - nram_px, gdram_px + (cur_s_begin - 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, cur_t_size * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - // Load p(s-1, t), one row - __memcpy_async(nram_p + 1, - gdram_p + (cur_s_begin - 1) * (T + 1) + 
cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, 0, 0, 0); - } else { // cur_s_begin == batch_s_begin, skip first row, and write -inf - if (cur_s_size > 1) { - __memcpy_async( - nram_px + cur_t_size, gdram_px + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, cur_t_size * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 2); - } - __nramset_async(nram_px, cur_t_size, (float)(-INFINITY), 0, 0); - // p(s-1, t) first row write -inf - __nramset_async(nram_p + 1, cur_t_size, (float)(-INFINITY), 0, 0); - } - - // Compare current t_begin and batch_t_begin to decide load py or write -inf - if (cur_t_begin > batch_t_begin) { - // Load py(s, t-1) - __memcpy_async(nram_py, gdram_py + cur_s_begin * T + cur_t_begin - 1, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - // Load p(s, t-1) - __memcpy_async(nram_p + cur_t_size + 1, - gdram_p + cur_s_begin * (T + 1) + cur_t_begin - 1, - sizeof(float), GDRAM2NRAM, (cur_t_size + 1) * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - } else { // cur_t_begin == batch_t_begin, skip first column, and write -inf - if (cur_t_size > 1) { - __memcpy_async(nram_py + 1, gdram_py + cur_s_begin * T + cur_t_begin, - (cur_t_size - 1) * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - } - __nramset_async(nram_py, 1, (float)(-INFINITY), cur_t_size * sizeof(float), - cur_s_size - 1); - // p(s, t-1) first column write -inf - __nramset_async(nram_p + cur_t_size + 1, 1, (float)(-INFINITY), - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - } - - // sync for memcpy async - __sync(); -} - -__mlu_func__ void computePByDiagonal( - float *nram_px, float *nram_py, float *nram_p, float *nram_cur_px, - float *nram_cur_py, float *nram_cur_p, float *max_val, float *mask, - float *temp, const int batch_s_begin, const int batch_t_begin, - const int cur_s_begin, const int cur_t_begin, const int 
cur_s_size, - const int cur_t_size) { - // Compute P by diagonal - const int repeat = cur_s_size + cur_t_size - 1; - const int max_s_t = __mluop_max(cur_s_size, cur_t_size); - const int min_s_t = __mluop_min(cur_s_size, cur_t_size); - - for (int i = 0; i < repeat; ++i) { - // Initialize p(batch_s_begin, batch_t_begin) to 0 - if (cur_s_begin == batch_s_begin && cur_t_begin == batch_t_begin && - i == 0) { - nram_p[cur_t_size + 2] = 0.0; - continue; - } - - int data_num = i < max_s_t ? __mluop_min(i + 1, min_s_t) - : cur_s_size + cur_t_size - i - 1; - - // px, py use same s, t index on nram, - // different -1 offset of row and column is considered when load - int first_s = __mluop_max(0, i - (cur_t_size - 1)); - int first_t = __mluop_min(i, cur_t_size - 1); - - // Move p(s-1, t), p(s, t-1), px(s-1, t), py(s, t-1) - __memcpy(nram_cur_p, nram_p + first_s * (cur_t_size + 1) + first_t + 1, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num); - __memcpy(nram_cur_px, nram_px + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - __memcpy(nram_cur_py, nram_py + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - - // Compute current p - __bang_add(nram_cur_px, nram_cur_px, nram_cur_p, data_num); - __bang_add(nram_cur_py, nram_cur_py, nram_cur_p + 1, data_num); - logAddVector(nram_cur_p, nram_cur_px, nram_cur_py, max_val, mask, temp, - data_num); - - // Move p back - __memcpy(nram_p + (first_s + 1) * (cur_t_size + 1) + first_t + 1, - nram_cur_p, sizeof(float), NRAM2NRAM, cur_t_size * sizeof(float), - sizeof(float), data_num - 1); - } -} - -__mlu_global__ void mluBlockDefaultMutualInformationForward( - const int B, const int S, const int T, const int step_i, - const int job_num_on_step, const int s_block_num, const int t_block_num, - const int s_block_size, const int t_block_size, const float *px, 
- const float *py, const bool has_boundary, const int64_t *opt_boundary, - float *p, float *ans) { - /************************* NRAM SPACE *******************************/ - /*|----------------------------------------------------------------|*/ - /*| px, py | p |max_val,mask,temp|cur_px |cur_py | cur_p |*/ - /*| 2*S*T |(S+1)*(T+1)| 3 * min_len |min_len|min_len|min_len+1|*/ - /*|----------------------------------------------------------------|*/ - - // NOTE: s and t block size has already + 1 on S and T - int min_s_t_block_size = __mluop_min(s_block_size, t_block_size); - float *nram_px = (float *)nram_buffer; - float *nram_py = nram_px + s_block_size * t_block_size; - float *nram_p = nram_py + s_block_size * t_block_size; - float *nram_max_val = nram_p + (s_block_size + 1) * (t_block_size + 1); - float *nram_mask = nram_max_val + min_s_t_block_size; - float *nram_temp = nram_mask + min_s_t_block_size; - float *nram_cur_px = nram_temp + min_s_t_block_size; - float *nram_cur_py = nram_cur_px + min_s_t_block_size; - float *nram_cur_p = nram_cur_py + min_s_t_block_size; - - int batch_idx; - int batch_s_begin = 0; - int batch_t_begin = 0; - int batch_s_end = S; - int batch_t_end = T; - int cur_s_begin, cur_t_begin, cur_s_end, cur_t_end, cur_s_size, cur_t_size; - bool need_compute_ans; - - // According to has_boundary, calculate current job scope - bool need_return = calPartitionJobScope( - has_boundary, opt_boundary, B, S, T, step_i, job_num_on_step, s_block_num, - t_block_num, s_block_size, t_block_size, batch_idx, batch_s_begin, - batch_t_begin, batch_s_end, batch_t_end, cur_s_begin, cur_t_begin, - cur_s_end, cur_t_end, cur_s_size, cur_t_size, need_compute_ans, ans); - // Because taskDimX could change to taskDimY, so not all jobs need to compute - if (need_return) { - return; - } - - const int px_one_batch_num = S * (T + 1); - const int py_one_batch_num = (S + 1) * T; - const int p_one_batch_num = (S + 1) * (T + 1); - - const float *gdram_px = px + batch_idx * 
px_one_batch_num; - const float *gdram_py = py + batch_idx * py_one_batch_num; - float *gdram_p = p + batch_idx * p_one_batch_num; - - // LoadInitP, load px, py, other block p, or write -inf at first row/column - loadInitP(gdram_px, gdram_py, gdram_p, nram_px, nram_py, nram_p, S, T, - batch_s_begin, batch_t_begin, cur_s_begin, cur_t_begin, cur_s_size, - cur_t_size); - - // ComputeP by diagonal - // p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t], p[b,s,t-1] + py[b,s,t-1]) - computePByDiagonal(nram_px, nram_py, nram_p, nram_cur_px, nram_cur_py, - nram_cur_p, nram_max_val, nram_mask, nram_temp, - batch_s_begin, batch_t_begin, cur_s_begin, cur_t_begin, - cur_s_size, cur_t_size); - - // StoreP - __memcpy(gdram_p + cur_s_begin * (T + 1) + cur_t_begin, - nram_p + cur_t_size + 2, cur_t_size * sizeof(float), NRAM2GDRAM, - (T + 1) * sizeof(float), (cur_t_size + 1) * sizeof(float), - cur_s_size - 1); - - // If last compute step, need store p[s_end, t_end] to ans - if (need_compute_ans) { - ans[batch_idx] = nram_p[cur_s_size * (cur_t_size + 1) + cur_t_size]; - } -} - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - KERNEL_CHECK( - mluBlockDefaultMutualInformationForward<<>>( - B, S, T, step_i, job_num_on_step, s_block_num, t_block_num, - s_block_size, t_block_size, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, (float *)ans)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_forward/mutual_information_forward_utils.h b/kernels/mutual_information_forward/mutual_information_forward_utils.h deleted file mode 100644 index fc743a95b..000000000 --- 
a/kernels/mutual_information_forward/mutual_information_forward_utils.h +++ /dev/null @@ -1,73 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ -#define KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ - -#include "mlu_op.h" - -#define MIN_LOG_DIFF_FLOAT -15.9423847198486328125f - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void logAddVector(float *dst, float *src1, float *src2, - float *max_value, float *mask, float *temp, - int data_num) { - __bang_nan_minimum(dst, src1, src2, data_num); - __bang_maximum(max_value, src1, src2, data_num); - - // If src1 is nan, then max_value = src1 = nan - // use band with exp and mantissa bits, then compare ge with 0x7f800001 - __asm__ volatile( - "fuse.nram.s32 [%[dst]], %[size], [%[src0]]," - ".and(%[src1]), .ge(%[src2]), .mul(%[src3])," - ".and([%[src4]]);\n" ::[dst] "r"((int32_t *)temp), - [ size ] "r"(data_num), [ src0 ] "r"((int32_t *)src1), - [ src1 ] "r"(0x7fffffff), [ src2 ] "r"(0x7f800001), [ src3 ] "r"(-1), - [ src4 ] "r"((int32_t *)src1)); - __bang_add(max_value, max_value, temp, data_num); - - // Compute log sum exp: max_value + log1p(exp(min_value - max_value)) - __bang_sub(dst, dst, max_value, data_num); // min_value - max_value - __bang_ge_scalar(mask, dst, MIN_LOG_DIFF_FLOAT, data_num); - __mluop_exp(dst, dst, nullptr, 0, data_num); - __bang_add_scalar(dst, dst, 1.f, data_num); - __mluop_log(dst, dst, nullptr, 0, data_num); - __bang_add(dst, dst, max_value, data_num); - - // If min_value - max_value < MIN_LOG_DIFF_FLOAT, return the larger one - // mask eq with 0x3f800000(float32(1.0)), -> 0xffffffff - __asm__ volatile( - "fuse.nram.s32 [%[dst]], %[size], [%[src0]]," - ".eq(%[src1]), .mul(%[src2]);\n" ::[dst] "r"((int32_t *)mask), - [ size ] "r"(data_num), [ src0 ] "r"((int32_t *)mask), - [ src1 ] "r"(0x3f800000), [ src2 ] "r"(-1)); - __bang_band((char *)dst, (char *)dst, (char *)mask, data_num * sizeof(float)); - - // Reverse the mask bits, ((int)mask+1)*(-1), 
0->-1, -1->0 - __bang_fusion(FUSION_FAM, (int *)mask, (int *)mask, 1, -1, data_num); - __bang_band((char *)max_value, (char *)max_value, (char *)mask, - data_num * sizeof(float)); - __bang_add(dst, dst, max_value, data_num); -} - -#endif // KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ // NOLINT diff --git a/kernels/roi_align_backward/roi_align_backward.cpp b/kernels/roi_align_backward/roi_align_backward.cpp deleted file mode 100644 index eda7a1641..000000000 --- a/kernels/roi_align_backward/roi_align_backward.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpRoiAlignBackward( - mluOpHandle_t handle, const float spatial_scale, const int sampling_ratio, - const bool aligned, const mluOpTensorDescriptor_t grads_desc, - const void *grads, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const mluOpTensorDescriptor_t grads_image_desc, - void *grads_image) { - PARAM_CHECK("mluOpRoiAlignBackward", handle != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", boxes_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", boxes != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads_image_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads_image != NULL); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_desc, cnnl_grads_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(boxes_desc, cnnl_boxes_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_image_desc, - cnnl_grads_image_desc); - CHECK_FUNC_RETURN( - cnnlRoiAlignBackward(cnnl_handle, spatial_scale, sampling_ratio, aligned, - cnnl_grads_desc, grads, cnnl_boxes_desc, boxes, - cnnl_grads_image_desc, grads_image), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignBackward] Internal error accured in " - "mluOpRoiAlignBackward.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_boxes_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_image_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpRoiAlignBackward_v2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_desc, - const void *grads, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const mluOpTensorDescriptor_t argmax_x_desc, - const 
void *argmax_x, const mluOpTensorDescriptor_t argmax_y_desc, - const void *argmax_y, const float spatial_scale, const int sampling_ratio, - const bool aligned, const int pool_mode, - const mluOpTensorDescriptor_t grads_image_desc, void *grads_image) { - PARAM_CHECK("mluOpRoiAlignBackward_v2", handle != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", boxes_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", boxes != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads_image_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads_image != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_desc, cnnl_grads_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(boxes_desc, cnnl_boxes_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_image_desc, - cnnl_grads_image_desc); - - cnnlTensorDescriptor_t cnnl_argmax_x_desc = NULL; - cnnlTensorDescriptor_t cnnl_argmax_y_desc = NULL; - - if (pool_mode == 0) { - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_x_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_x != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_y_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_y != NULL); - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_x_desc, cnnl_argmax_x_desc); - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_y_desc, cnnl_argmax_y_desc); - } - CHECK_FUNC_RETURN(cnnlRoiAlignBackward_v2( - cnnl_handle, cnnl_grads_desc, grads, cnnl_boxes_desc, - boxes, cnnl_argmax_x_desc, argmax_x, cnnl_argmax_y_desc, - argmax_y, spatial_scale, sampling_ratio, aligned, - pool_mode, cnnl_grads_image_desc, grads_image), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignBackward_v2] Internal error accured in " - "mluOpRoiAlignBackward_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_boxes_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_image_desc); - if (pool_mode == 0) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_y_desc); - } - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp deleted file mode 100644 index 9f5068212..000000000 --- a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_x, - size_t *workspace_size) { - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", handle != NULL); - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", desc_x != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); - - CHECK_FUNC_RETURN( - cnnlGetSyncBatchnormBackwardReduceWorkspaceSize(cnnl_handle, cnnl_desc_x, - workspace_size), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce_v2] Internal error" - " accured in mluOpGetSyncBatchnormBackwardReduceWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( - mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, - const mluOpTensorDescriptor_t desc_x, const void *x, - const mluOpTensorDescriptor_t desc_mean, const void *mean, - const mluOpTensorDescriptor_t desc_invstd, const void *invstd, - const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, - const mluOpTensorDescriptor_t desc_dbias, void *dbias, - const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, - const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, - const bool needs_input_grad0, const bool needs_input_grad1, - const bool needs_input_grad2) { - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_invstd != NULL); - 
PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", invstd != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dz, cnnl_desc_dz); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_mean, cnnl_desc_mean); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_invstd, cnnl_desc_invstd); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dfilter, cnnl_desc_dfilter); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dbias, cnnl_desc_dbias); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy, cnnl_desc_sum_dy); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy_xmu, - cnnl_desc_sum_dy_xmu); - - CHECK_FUNC_RETURN( - cnnlSyncBatchnormBackwardReduce( - cnnl_handle, cnnl_desc_dz, dz, cnnl_desc_x, x, cnnl_desc_mean, mean, - cnnl_desc_invstd, invstd, cnnl_desc_dfilter, dfilter, cnnl_desc_dbias, - dbias, cnnl_desc_sum_dy, sum_dy, cnnl_desc_sum_dy_xmu, sum_dy_xmu, - needs_input_grad0, needs_input_grad1, needs_input_grad2), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce] Internal error" - " accured in mluOpSyncBatchnormBackwardReduce.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dz); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_mean); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_invstd); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dfilter); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dbias); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy_xmu); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( - mluOpHandle_t 
handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, - const mluOpTensorDescriptor_t desc_x, const void *x, - const mluOpTensorDescriptor_t desc_mean, const void *mean, - const mluOpTensorDescriptor_t desc_invstd, const void *invstd, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, - const mluOpTensorDescriptor_t desc_dbias, void *dbias, - const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, - const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, - const bool needs_input_grad0, const bool needs_input_grad1, - const bool needs_input_grad2) { - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", invstd != NULL); - if (workspace_size > 0) { - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", workspace != NULL); - } - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dz, cnnl_desc_dz); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_mean, cnnl_desc_mean); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_invstd, cnnl_desc_invstd); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dfilter, cnnl_desc_dfilter); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dbias, cnnl_desc_dbias); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy, cnnl_desc_sum_dy); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy_xmu, - cnnl_desc_sum_dy_xmu); - - CHECK_FUNC_RETURN( - cnnlSyncBatchnormBackwardReduce_v2( - cnnl_handle, cnnl_desc_dz, dz, cnnl_desc_x, x, cnnl_desc_mean, mean, - cnnl_desc_invstd, invstd, workspace, workspace_size, - cnnl_desc_dfilter, dfilter, cnnl_desc_dbias, dbias, cnnl_desc_sum_dy, - sum_dy, cnnl_desc_sum_dy_xmu, sum_dy_xmu, needs_input_grad0, - needs_input_grad1, needs_input_grad2), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce] Internal error" - " accured in mluOpSyncBatchnormBackwardReduce_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dz); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_mean); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_invstd); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dfilter); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dbias); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy_xmu); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling_backward/roi_pooling_backward.cpp b/kernels/roi_pooling_backward/roi_pooling_backward.cpp deleted file mode 100644 index 3fbcf6c57..000000000 --- a/kernels/roi_pooling_backward/roi_pooling_backward.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpRoiPoolingBackward( - mluOpHandle_t handle, mluOpPoolingMode_t pooling_mode, - const mluOpTensorDescriptor_t grads_desc, const void *grads, - const mluOpTensorDescriptor_t rois_desc, const void *rois, - const mluOpTensorDescriptor_t argmax_desc, const int *argmax, - const float spatial_scale, const mluOpTensorDescriptor_t grads_image_desc, - void *grads_image) { - PARAM_CHECK("[mluOpRoiPoolingBackward]", handle != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", rois_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", rois != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", argmax_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", argmax != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads_image_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads_image != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_desc, cnnl_grads_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(rois_desc, cnnl_rois_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_desc, cnnl_argmax_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_image_desc, - cnnl_grads_image_desc); - - CHECK_FUNC_RETURN( - cnnlRoiPoolingBackward(cnnl_handle, cnnlPoolingMode_t(pooling_mode), - cnnl_grads_desc, grads, cnnl_rois_desc, rois, - cnnl_argmax_desc, argmax, spatial_scale, - cnnl_grads_image_desc, grads_image), - CNNL_STATUS_SUCCESS, - "[mluOpRoiPoolingBackward] Internal error" - " accured in mluOpRoiPoolingBackward.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_rois_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_desc); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_image_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling_forward/roi_pooling_forward.cpp b/kernels/roi_pooling_forward/roi_pooling_forward.cpp deleted file mode 100644 index 389842d64..000000000 --- a/kernels/roi_pooling_forward/roi_pooling_forward.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpRoiPoolingForward( - mluOpHandle_t handle, mluOpPoolingMode_t pooling_mode, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t rois_desc, const void *rois, - float spatial_scale, const mluOpTensorDescriptor_t output_desc, - void *output, int *argmax) { - PARAM_CHECK("[mluOpRoiPoolingForward]", handle != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", input_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", input != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", rois_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", rois != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", output_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", output != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", argmax != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(rois_desc, cnnl_rois_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - - CHECK_FUNC_RETURN( - cnnlRoiPoolingForward(cnnl_handle, cnnlPoolingMode_t(pooling_mode), - cnnl_input_desc, input, cnnl_rois_desc, rois, - spatial_scale, cnnl_output_desc, output, argmax), - CNNL_STATUS_SUCCESS, - "[mluOpRoiPoolingForward] Internal error" - " accured in mluOpRoiPoolingForward.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_rois_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roialign_forward/roialign_forward.cpp b/kernels/roialign_forward/roialign_forward.cpp deleted file mode 100644 index 6e42efde7..000000000 --- a/kernels/roialign_forward/roialign_forward.cpp +++ 
/dev/null @@ -1,102 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API -mluOpCreateRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t *desc) { - PARAM_CHECK("[mluOpRoiAlignForward_v2]", desc != NULL); - CHECK_FUNC_RETURN(cnnlCreateRoiAlignDescriptor(desc), CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error accured in " - "mluOpCreateRoiAlignForwardDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDestroyRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t desc) { - PARAM_CHECK("[mluOpRoiAlignForward_v2]", desc != NULL); - CHECK_FUNC_RETURN(cnnlDestroyRoiAlignDescriptor(desc), CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error accured in " - "mluOpDestroyRoiAlignForwardDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSetRoiAlignForwardDescriptor_v2( - mluOpRoiAlignForwardDescriptor_t desc, const int pooled_height, - const int pooled_width, const int sampling_ratio, const float spatial_scale, - const int pool_mode, const bool aligned) { - PARAM_CHECK("[mluOpRoiAlignForward_v2]", desc != NULL); - CHECK_FUNC_RETURN(cnnlSetRoiAlignDescriptor_v2( - desc, pooled_height, pooled_width, sampling_ratio, - spatial_scale, pool_mode, aligned), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error accured in " - "mluOpSetRoiAlignForwardDescriptor_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpRoiAlignForward_v2( - mluOpHandle_t handle, const mluOpRoiAlignForwardDescriptor_t roialign_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t boxes_desc, const void *boxes, - const mluOpTensorDescriptor_t output_desc, void *output, - const mluOpTensorDescriptor_t argmax_x_desc, void *argmax_x, - const 
mluOpTensorDescriptor_t argmax_y_desc, void *argmax_y) { - PARAM_CHECK("mluOpRoiAlignForward_v2", handle != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", roialign_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", input_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", boxes_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", output_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", input != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", boxes != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", output != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(boxes_desc, cnnl_boxes_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - - cnnlTensorDescriptor_t cnnl_argmax_x_desc = NULL; - cnnlTensorDescriptor_t cnnl_argmax_y_desc = NULL; - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_x_desc, cnnl_argmax_x_desc); - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_y_desc, cnnl_argmax_y_desc); - CHECK_FUNC_RETURN( - cnnlRoiAlign_v2(cnnl_handle, roialign_desc, cnnl_input_desc, input, - cnnl_boxes_desc, boxes, cnnl_output_desc, output, - cnnl_argmax_x_desc, argmax_x, cnnl_argmax_y_desc, - argmax_y), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error" - " accured in mluOpRoiAlignForward_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_boxes_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/kernels/sync_batch_norm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp similarity index 
100% rename from kernels/roi_pooling/sync_batchnorm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp rename to kernels/sync_batch_norm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/kernels/sync_batch_norm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp rename to kernels/sync_batch_norm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp diff --git a/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/kernels/sync_batch_norm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp similarity index 100% rename from kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp rename to kernels/sync_batch_norm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/kernels/sync_batch_norm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp rename to kernels/sync_batch_norm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/kernels/sync_batch_norm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp rename to kernels/sync_batch_norm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp diff --git 
a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/kernels/sync_batch_norm/sync_batchnorm_stats/sync_batchnorm_stats.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_stats/sync_batchnorm_stats.cpp rename to kernels/sync_batch_norm/sync_batchnorm_stats/sync_batchnorm_stats.cpp diff --git a/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp deleted file mode 100644 index 80e6e829c..000000000 --- a/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemt( - mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_y_desc, - const void *diff_y, const mluOpTensorDescriptor_t x_desc, const void *x, - const mluOpTensorDescriptor_t mean_desc, const void *mean, - const mluOpTensorDescriptor_t invstd_desc, const void *invstd, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t mean_dy_desc, const void *mean_dy, - const mluOpTensorDescriptor_t mean_dy_xmu_desc, const void *mean_dy_xmu, - const mluOpTensorDescriptor_t diffcnnl_x_desc, void *diff_x) { - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diff_y_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy_xmu_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diffcnnl_x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diff_y != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy_xmu != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diff_x != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diff_y_desc, cnnl_diff_y_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_dy_desc, cnnl_mean_dy_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_dy_xmu_desc, - cnnl_mean_dy_xmu_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diffcnnl_x_desc, - cnnl_diffcnnl_x_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormBackwardElemt( - cnnl_handle, cnnl_diff_y_desc, diff_y, cnnl_x_desc, x, cnnl_mean_desc, - mean, cnnl_invstd_desc, invstd, cnnl_filter_desc, filter, - cnnl_mean_dy_desc, mean_dy, cnnl_mean_dy_xmu_desc, mean_dy_xmu, - cnnl_diffcnnl_x_desc, diff_x), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormBackwardElemt] Internal error" - " accured in mluOpSyncBatchNormBackwardElemt.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diff_y_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_dy_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_dy_xmu_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diffcnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp deleted file mode 100644 index e7ce0d9b6..000000000 --- a/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemtV2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_y_desc, - const void *diff_y, const mluOpTensorDescriptor_t x_desc, const void *x, - const mluOpTensorDescriptor_t mean_desc, const void *mean, - const mluOpTensorDescriptor_t invstd_desc, const void *invstd, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t sum_dy_desc, const void *sum_dy, - const mluOpTensorDescriptor_t sum_dy_xmu_desc, const void *sum_dy_xmu, - const mluOpTensorDescriptor_t count_desc, const void *count, - const mluOpTensorDescriptor_t diffcnnl_x_desc, void *diff_x) { - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diff_y_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy_xmu_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", count_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diffcnnl_x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diff_y != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy_xmu != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", count != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diff_x 
!= NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diff_y_desc, cnnl_diff_y_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sum_dy_desc, cnnl_sum_dy_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sum_dy_xmu_desc, - cnnl_sum_dy_xmu_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(count_desc, cnnl_count_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diffcnnl_x_desc, - cnnl_diffcnnl_x_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormBackwardElemtV2( - cnnl_handle, cnnl_diff_y_desc, diff_y, cnnl_x_desc, x, cnnl_mean_desc, - mean, cnnl_invstd_desc, invstd, cnnl_filter_desc, filter, - cnnl_sum_dy_desc, sum_dy, cnnl_sum_dy_xmu_desc, sum_dy_xmu, - cnnl_count_desc, count, cnnl_diffcnnl_x_desc, diff_x), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormBackwardElemtV2] Internal error" - " accured in mluOpSyncBatchNormBackwardElemtV2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diff_y_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_sum_dy_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_sum_dy_xmu_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_count_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diffcnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp deleted file mode 100644 index 54d23d574..000000000 --- 
a/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormElemt( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, - const mluOpTensorDescriptor_t mean_desc, const void *mean, - const mluOpTensorDescriptor_t invstd_desc, const void *invstd, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t bias_desc, const void *bias, - const mluOpTensorDescriptor_t y_desc, void *y) { - PARAM_CHECK("[mluOpSyncBatchNormElemt]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", - (filter_desc != NULL && bias_desc != NULL) || - (filter_desc == NULL && bias_desc == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", y_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", mean != NULL); - PARAM_CHECK( - "[mluOpSyncBatchNormElemt]", - (filter != NULL && bias != NULL) || (filter == NULL && bias == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", y != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(y_desc, cnnl_y_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormElemt(cnnl_handle, cnnl_x_desc, x, cnnl_mean_desc, mean, - cnnl_invstd_desc, invstd, cnnl_filter_desc, 
filter, - cnnl_bias_desc, bias, cnnl_y_desc, y), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormElemt] Internal error" - " accured in mluOpSyncBatchNormElemt.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp deleted file mode 100644 index e892d85b5..000000000 --- a/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormGatherStatsWithCounts( - mluOpHandle_t handle, const mluOpTensorDescriptor_t mean_all_desc, - const void *mean_all, const mluOpTensorDescriptor_t invstd_all_desc, - const void *invstd_all, const mluOpTensorDescriptor_t movingcnnl_mean_desc, - void *moving_mean, const mluOpTensorDescriptor_t moving_var_desc, - void *moving_var, float momentum, float eps, - const mluOpTensorDescriptor_t count_all_desc, const void *count_all, - const mluOpTensorDescriptor_t mean_desc, void *mean, - const mluOpTensorDescriptor_t invstd_desc, void *invstd) { - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - mean_all_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - invstd_all_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - count_all_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - (movingcnnl_mean_desc != NULL && moving_var_desc != NULL) || - (movingcnnl_mean_desc == NULL && moving_var_desc == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", mean_all != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", invstd_all != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - (moving_mean != NULL && moving_var != NULL) || - (moving_mean == NULL && moving_var == NULL)); - 
PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", count_all != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", invstd != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_all_desc, - cnnl_mean_all_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_all_desc, - cnnl_invstd_all_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(movingcnnl_mean_desc, - cnnl_movingcnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(moving_var_desc, - cnnl_moving_var_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(count_all_desc, - cnnl_count_all_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormGatherStatsWithCounts( - cnnl_handle, cnnl_mean_all_desc, mean_all, cnnl_invstd_all_desc, - invstd_all, cnnl_movingcnnl_mean_desc, moving_mean, - cnnl_moving_var_desc, moving_var, momentum, eps, cnnl_count_all_desc, - count_all, cnnl_mean_desc, mean, cnnl_invstd_desc, invstd), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormGatherStatsWithCounts] Internal error" - " accured in mluOpSyncBatchNormGatherStatsWithCounts.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_all_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_all_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_movingcnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_moving_var_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_count_all_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp deleted file mode 100644 index 
35d53cf85..000000000 --- a/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchNormStatsWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, - size_t *workspace_size) { - PARAM_CHECK("mluOpSyncBatchNormStats_v2", handle != NULL); - PARAM_CHECK("mluOpSyncBatchNormStats_v2", x_desc != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - - CHECK_FUNC_RETURN(cnnlGetSyncBatchNormStatsWorkspaceSize( - cnnl_handle, cnnl_x_desc, workspace_size), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormStats_v2] Internal error" - " accured in mluOpGetSyncBatchNormStatsWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, - const float eps, const mluOpTensorDescriptor_t mean_desc, void *mean, - const mluOpTensorDescriptor_t invstd_desc, void *invstd) { - PARAM_CHECK("[mluOpSyncBatchNormStats]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", invstd != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormStats(cnnl_handle, cnnl_x_desc, x, eps, cnnl_mean_desc, - mean, 
cnnl_invstd_desc, invstd), - CNNL_STATUS_SUCCESS, - "[cnnlSyncBatchNormStats] Internal error" - " accured in cnnlSyncBatchNormStats.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats_v2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, - void *workspace, size_t workspace_size, const float eps, - const mluOpTensorDescriptor_t mean_desc, void *mean, - const mluOpTensorDescriptor_t invstd_desc, void *invstd) { - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", invstd != NULL); - if (workspace_size > 0) { - PARAM_CHECK("mluOpSyncBatchNormStats_v2", workspace != NULL); - } - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - - CHECK_FUNC_RETURN(cnnlSyncBatchNormStats_v2( - cnnl_handle, cnnl_x_desc, x, workspace, workspace_size, - eps, cnnl_mean_desc, mean, cnnl_invstd_desc, invstd), - CNNL_STATUS_SUCCESS, - "[cnnlSyncBatchNormStats_v2] Internal error" - " accured in cnnlSyncBatchNormStats_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - 
DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/mlu_op.h b/mlu_op.h index 3f61b45ed..e7331425e 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -1235,7 +1235,7 @@ typedef struct mluOpCarafeStruct *mluOpCarafeDescriptor_t; mluOpStatus_t MLUOP_WIN_API mluOpCreateTensorDescriptor(mluOpTensorDescriptor_t *desc); -// Group: GetIndicePairs +// Group:SparseConv /*! * @brief Creates a tensor descriptor pointed by \b desc that holds the dimensions, pad, stride, * dilation, sub_m, transpose, inverse and layout of input filter and output tensor shape. @@ -1274,7 +1274,7 @@ mluOpCreateTensorDescriptor(mluOpTensorDescriptor_t *desc); mluOpStatus_t MLUOP_WIN_API mluOpCreateSparseConvolutionDescriptor(mluOpSparseConvolutionDescriptor_t *desc); -// Group: GetIndicePairs +// Group:SparseConv /*! * @brief Destroys a convolution descriptor \b desc that was previously created with the * ::mluOpCreateSparseConvolutionDescriptor function. @@ -1517,7 +1517,7 @@ mluOpSetTensorDescriptor_v2(mluOpTensorDescriptor_t desc, int dimNb, const int64_t dimSize[]); -// Group: GetIndicePairs +// Group:SparseConv /*! * @brief Initializes the sparse convolution descriptor \b desc that was previously created * with ::mluOpCreateSparseConvolutionDescriptor, and sets the information @@ -1620,7 +1620,7 @@ mluOpSetSparseConvolutionDescriptor(mluOpSparseConvolutionDescriptor_t desc, const int transpose, const int inverse); -// Group: GetIndicePairs +// Group:SparseConv /*! * @brief Obtains the parameter num_act_out from ::mluOpSparseConvolutionDescriptor_t. * @@ -3487,7 +3487,7 @@ mluOpDiv(mluOpHandle_t handle, const mluOpTensorDescriptor_t z_desc, void *z); -// Group: DynamicPointToVoxelBackward +// Group:DynamicPointToVoxel /*! * @brief Gets extra space size for the DynamicPointToVoxelBackward operation. 
* @@ -3551,7 +3551,7 @@ mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(const mluOpHandle_t handle, const mluOpTensorDescriptor_t voxel_num_desc, size_t *workspace_size); -// Group: DynamicPointToVoxelBackward +// Group:DynamicPointToVoxel /*! * @brief Performs the back-propagation of DynamicPointToVoxelForward * operation to compute the gradient for input \b grad_voxel_feats @@ -4834,7 +4834,7 @@ mluOpPsRoiPoolBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t bottom_grad_desc, void *bottom_grad); -// Group: RoiAlignForward +// Group:RoiAlign /*! * @brief Creates a descriptor pointed by \b desc for ::mluOpRoiAlignForward_v2, * and allocates memory for holding the information about the function. @@ -4872,7 +4872,7 @@ mluOpPsRoiPoolBackward(mluOpHandle_t handle, mluOpStatus_t MLUOP_WIN_API mluOpCreateRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t *desc); -// Group: RoiAlignForward +// Group:RoiAlign /*! * @brief Initializes the descriptor \b desc that was previously created with * ::mluOpCreateRoiAlignForwardDescriptor function, and sets RoiAlign information @@ -4932,7 +4932,7 @@ mluOpSetRoiAlignForwardDescriptor_v2(mluOpRoiAlignForwardDescriptor_t roialign_d const int pool_mode, const bool aligned); -// Group: RoiAlignForward +// Group:RoiAlign /*! * @brief Destroys a RoiAlign descriptor \b desc that was previously created * with ::mluOpCreateRoiAlignForwardDescriptor function. @@ -4972,7 +4972,7 @@ mluOpSetRoiAlignForwardDescriptor_v2(mluOpRoiAlignForwardDescriptor_t roialign_d mluOpStatus_t MLUOP_WIN_API mluOpDestroyRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t desc); -// Group: RoiAlignForward +// Group:RoiAlign /*! * @brief Computes the output feature map \b output based on the input feature map \b input * and bounding boxes \b boxes to perform this function. 
This function supports @@ -6500,7 +6500,7 @@ mluOpBboxOverlaps(mluOpHandle_t handle, const mluOpTensorDescriptor_t ious_desc, void *ious); -// Group: ThreeInterpolate +// Group:ThreeInterpolate /*! * @brief Computes weighted linear interpolation on 3 points by using * 3 indices in \b indices to select 3 points in \b features, uses the @@ -6596,7 +6596,7 @@ mluOpThreeInterpolateForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, void *output); -// Group: ThreeInterpolate +// Group:ThreeInterpolate /*! * @brief Computes the gradients of feature map \b grad_features based on the * inputs \b grad_output , \b indices , and \b weights to perform the backpropagation @@ -6685,7 +6685,7 @@ mluOpThreeInterpolateBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_features_desc, void *grad_features); -// Group: Ballquery +// Group:BallQuery /*! * @brief Takes the point's index in the \b new_xyz set as the center of the sphere, * uses \b min_radius and \b max_radius as the radius, and returns the \b idx of @@ -7173,7 +7173,7 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t data_col_desc, void *data_col); -// Group: MoeDispatchBackwardData +// Group:MoeDispatch /*! * @brief Calculates the inverse gradient of \b input tensor, and returns the results in the output * tensor \b grad_input. @@ -7395,7 +7395,7 @@ mluOpMsDeformAttnBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_attn_weight_desc, void *grad_attn_weight); -// Group: MutualInformationBackward +// Group:MutualInformation /*! * @brief Returns the size of the MLU memory as an extra workspace * to optimize ::mluOpMutualInformationBackward. @@ -7457,7 +7457,7 @@ mluOpGetMutualInformationBackwardWorkspaceSize(mluOpHandle_t handle, const bool overwrite_ans_grad, size_t *workspace_size); -// Group: MutualInformationBackward +// Group:MutualInformation /*! * @brief Computes the gradients of tensor \b px and tensor \b py. 
* @@ -7575,7 +7575,7 @@ mluOpMutualInformationBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t py_grad_desc, void *py_grad); -// Group: MutualInformationForward +// Group:MutualInformation /*! * @brief Returns the size of the MLU memory as an extra workspace * to optimize ::mluOpMutualInformationForward. @@ -7634,7 +7634,7 @@ mluOpGetMutualInformationForwardWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t ans_desc, size_t *workspace_size); -// Group: MutualInformationForward +// Group:MutualInformation /*! * @brief Computes mutual information between tensor \b px and tensor \b py. * @@ -8450,7 +8450,7 @@ mluOpPsamaskBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t dx_desc, void *dx); -// Group: GetIndicePairs +// Group:SparseConv /*! * @brief Computes the get_indice_paris operation, then returns the results in the output * tensor \b out_indices , \b indice_pairs and \b ind, ice_num. @@ -8549,7 +8549,7 @@ mluOpGetIndicePairs(mluOpHandle_t handle, const mluOpTensorDescriptor_t indice_num_desc, void *indice_num); -// Group: GetIndicePairs +// Group:SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace * to optimize the get_indice_pairs operation. @@ -8620,7 +8620,7 @@ mluOpGetIndicePairsWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t indice_num_desc, size_t *workspace_size); -// Group: ActiveRotatedFilterForward +// Group:ActiveRotatedFilter /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize ::mluOpActiveRotatedFilterForward. The size of the extra @@ -8666,7 +8666,7 @@ mluOpGetActiveRotatedFilterForwardWorkspaceSize(const mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, size_t *workspace_size); -// Group: ActiveRotatedFilterForward +// Group:ActiveRotatedFilter /*! * @brief Rotates \b input according to \b indices. 
This function encodes * the orientation information and generates orientation-sensitive features. @@ -9276,7 +9276,7 @@ mluOpBorderAlignBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_input_desc, void *grad_input); -// Group: IndiceConvolutionBackwardData +// Group:SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as * an extra workspace to optimize the indice convolution backward data operation. @@ -9349,7 +9349,7 @@ mluOpGetIndiceConvolutionBackwardDataWorkspaceSize(mluOpHandle_t handle, const int64_t inverse, size_t *workspace_size); -// Group: IndiceConvolutionBackwardData +// Group:SparseConv /*! * @brief Performs the back propagation of an indice convolution operation to * compute the gradient of input \b input_grad based on the gradient of response @@ -9489,7 +9489,7 @@ mluOpIndiceConvolutionBackwardData(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_grad_desc, void *input_grad); -// Group: IndiceConvolutionBackwardFilter +// Group:SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace * to optimize the indice_convolution_backward_filter operation. @@ -9562,7 +9562,7 @@ mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize(mluOpHandle_t handle, const int64_t sub_m, size_t *workspace_size); -// Group: IndiceConvolutionBackwardFilter +// Group:SparseConv /*! * @brief Computes the indice_convolution_backward_filter operation, then returns the results in the output * tensor \b filters_grad. @@ -9843,7 +9843,7 @@ mluOpRoiPointPool3d(mluOpHandle_t handle, const mluOpTensorDescriptor_t pooled_empty_flag_desc, void *pooled_empty_flag); -// Group: ThreeNNForward +// Group:ThreeNN /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize ::mluOpThreeNNForward. 
The size of the extra workspace is @@ -9889,7 +9889,7 @@ mluOpGetThreeNNForwardWorkspaceSize(const mluOpHandle_t handle, const mluOpTensorDescriptor_t known_desc, size_t *workspace_size); -// Group: ThreeNNForward +// Group:ThreeNN /*! * @brief Finds the closest 3 points of \b unknown among \b known, and outputs \b dist and index * \b idx tensor. This function firstly computes dist of each known point to a unknown point, and @@ -9971,7 +9971,7 @@ mluOpThreeNNForward(const mluOpHandle_t handle, const mluOpTensorDescriptor_t idx_desc, void *idx); -// Group: IndiceConvolutionForward +// Group:SparseConv /*! * @brief Returns in \b workspace_size of the MLU memory which is used as an extra workspace * to boost up indice_convolution_forward computation. @@ -10047,7 +10047,7 @@ mluOpGetIndiceConvolutionForwardWorkspaceSize(mluOpHandle_t handle, const int64_t sub_m, size_t *workspace_size); -// Group: IndiceConvolutionForward +// Group:SparseConv /*! * @brief Performs convolution on input sparse tensor \b features with kernel \b filters, * then returns the output sparse tensor \b features_out. @@ -10164,7 +10164,7 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t features_out_desc, void *features_out); -// Group: MoeDispatchForward +// Group:MoeDispatch /*! * @brief Dispatches the order of \b input tensor, and returns the * results in the output tensor \b dispatch in the MoE algorithm. @@ -10270,7 +10270,7 @@ mluOpMoeDispatchForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t dispatch_desc, void *dispatch); -// Group: MoeDispatchBackwardGate +// Group:MoeDispatch /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace * to optimize the moe_dispatch_backward_gate operation. 
@@ -10317,7 +10317,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, size_t *workspace_size); -// Group: MoeDispatchBackwardGate +// Group:MoeDispatch /*! * @brief Calculates the inverse gradient of \b gates tensor, and returns the results in the output * tensor \b grad_gates. @@ -10497,7 +10497,7 @@ mluOpPointsInBoxes(mluOpHandle_t handle, const mluOpTensorDescriptor_t points_indices_desc, void *points_indices); -// Group: RoiAlignBackward +// Group:RoiAlign /*! * @brief Computes the gradients of images \b grads_image using the gradients \b grads and * bounding boxes \b boxes to perform the backpropagation of ::mluOpRoiAlignForward_v2 @@ -10593,7 +10593,7 @@ mluOpRoiAlignBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); -// Group: RoiAlignBackward +// Group:RoiAlign /*! * @brief Computes the gradients of images \b grads_image based on the gradients \b grads, * bounding boxes \b boxes, the coordinate of x axis \b argmax_x, and the coordinate of y axis @@ -10731,7 +10731,7 @@ mluOpRoiAlignBackward_v2(mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); -// Group: MsDeformAttnForward +// Group:MsDeformAttn /*! * @brief Implements a multi-scale deformable attention module used in Deformable-Detr. * For detailed information about Deformable-Detr, see "Deformable DETR: Deformable @@ -10938,7 +10938,7 @@ mluOpTinShiftForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, void *output); -// Group: MaskedCol2im +// Group:MaskedIm2Col /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace to * optimize the MaskedCol2imForward operation. @@ -10993,7 +10993,7 @@ mluOpGetMaskedCol2imForwardWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t im_desc, size_t *workspace_size); -// Group: MaskedCol2im +// Group:MaskedIm2Col /*! 
* @brief Copies the data of the input tensor \b col to the special coordinates by combining \b mask_h_idx tensor * and \b mask_w_idx tensor of output tensor \b im. @@ -11085,7 +11085,7 @@ mluOpMaskedCol2imForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t im_desc, void *im); -// Group: DiffIouRotatedSortVerticesForward +// Group:DiffIouRotatedSortVertices /*! * @brief Sorts the effective vertices of the polygon formed by the intersection of two boxes, * and outputs the sorted vertex index. @@ -11157,7 +11157,7 @@ mluOpDiffIouRotatedSortVerticesForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t idx_desc, void *idx); -// Group: RoiPoolingForward +// Group:RoiPooling /*! * @brief Generates a fixed size feature map and input feature index * of argmax for each ROI (Regions of Interest) to perform ::mluOpRoiPoolingForward operation. @@ -11304,7 +11304,7 @@ mluOpRoiPoolingForward(mluOpHandle_t handle, void *output, int *argmax); -// Group: RoiPoolingBackward +// Group:RoiPooling /*! * @brief Computes the gradients of image \b grads_image based on the gradients \b grads and * region proposals \b rois to perform the backpropagation of ::mluOpRoiPoolingForward operation. @@ -11406,7 +11406,7 @@ mluOpRoiPoolingBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); -// Group: SyncBatchNormStats +// Group:SyncBatchNorm /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize ::mluOpSyncBatchNormStats_v2 operation. @@ -11454,7 +11454,7 @@ mluOpGetSyncBatchNormStatsWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, size_t *workspace_size); -// Group: SyncBatchNormStats +// Group:SyncBatchNorm /*! * @brief Computes the local mean and the local inverse standard deviation for each channel * across a batch of data in the training scenario. 
@@ -11549,7 +11549,7 @@ mluOpSyncBatchNormStats_v2(mluOpHandle_t handle, const mluOpTensorDescriptor_t invstd_desc, void *invstd); -// Group: SyncBatchNormStats +// Group:SyncBatchNorm /*! * @brief Computes the local mean and the local inverse standard deviation for each channel * across a batch of data in the training scenario. @@ -11633,7 +11633,7 @@ mluOpSyncBatchNormStats(mluOpHandle_t handle, const mluOpTensorDescriptor_t invstd_desc, void *invstd); -// Group: SyncBatchNormGatherStatsWithCounts +// Group:SyncBatchNorm /*! * @brief Computes the global mean and the global inverse standard deviation across aggregation * of the local mean and local inverse standard deviation of multiple MLU devices. @@ -11762,7 +11762,7 @@ mluOpSyncBatchNormGatherStatsWithCounts(mluOpHandle_t handle, const mluOpTensorDescriptor_t invstd_desc, void *invstd); -// Group: SyncBatchNormElemt +// Group:SyncBatchNorm /*! * @brief Applies Batch Normalization for each channel across a batch of data with the given mean, * inverse variance and scaling factors. @@ -12641,7 +12641,7 @@ mluOpSyncBatchnormBackwardReduce(mluOpHandle_t handle, const bool needs_input_grad1, const bool needs_input_grad2); -// Group: SyncBatchNormBackwardElemt +// Group:SyncBatchNorm /*! * @brief Computes the gradients of input in the training scenario. * @@ -12769,7 +12769,7 @@ mluOpSyncBatchNormBackwardElemt(mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_x_desc, void *diff_x); -// Group: SyncBatchNormBackwardElemt +// Group:SyncBatchNorm /*! * @brief Computes the gradients of input in the training scenario. *