[Feature](mlu-ops): add stride tensor check (#1066)
Co-authored-by: fuzhouxiang <[email protected]>
Zhouxiang and fuzhouxiang authored Aug 5, 2024
1 parent 2cd5bc6 commit 240c7c3
Showing 73 changed files with 946 additions and 366 deletions.
26 changes: 19 additions & 7 deletions core/logging.h
@@ -28,8 +28,10 @@
#include <string>
#include <limits>
#include <sstream>
#include "core/macros.h"

#include "core/cnlog.hpp"
#include "core/macros.h"
#include "core/util.h"
#include "mlu_op.h"

#define LARGE_TENSOR_NUM ((uint64_t)2147483648)
@@ -216,9 +218,17 @@ extern bool mluop_check_large_tensor_dim_size_;
return MLUOP_STATUS_NOT_SUPPORTED; \
}

#define STRIDE_TENSOR_CHECK(api, desc, reason) \
if (MLUOP_PREDICT_TRUE(desc != NULL)) { \
if (MLUOP_PREDICT_FALSE( \
MLUOP_PREDICT_TRUE(0 != mluOpGetTensorElementNum(desc)) && \
isStrideTensor(desc->dim, desc->dims, desc->strides))) { \
LOG(ERROR) << api << " stride tensor is not supported. " << reason; \
return MLUOP_STATUS_NOT_SUPPORTED; \
} \
}

#define MLUOP_CHECK(val) mluOpCheck((val), #val, __FILE__, __LINE__)

#define KERNEL_CALL_CHECK(parent_kernel, sub_kernel, status, statement) \
do { \
@@ -274,9 +284,11 @@ struct Voidifier {
return vmodule_activated; \
})(lvl, __FILE__))

#define VLOG(level) \
MLUOP_PREDICT_TRUE(!VLOG_IS_ON(({ \
static_assert(level > 0, "VLOG level should be greater than 0"); \
level; \
}))) \
? (void)0 : ::mluop::internal::Voidifier() & LOG(VLOG)

// This formats a value for a failing CHECK_XX statement. Ordinarily,
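For reference, with the branch-prediction macros stripped, STRIDE_TENSOR_CHECK(api, desc, reason) behaves like the following guard (a sketch derived from the macro above; the surrounding parameter-check function is implied, not shown):

// Expanded sketch of STRIDE_TENSOR_CHECK: NULL descriptors and
// zero-element tensors are skipped, so only a genuinely strided
// (non-contiguous) layout triggers the early return.
if (desc != NULL) {
  if (0 != mluOpGetTensorElementNum(desc) &&
      isStrideTensor(desc->dim, desc->dims, desc->strides)) {
    LOG(ERROR) << api << " stride tensor is not supported. " << reason;
    return MLUOP_STATUS_NOT_SUPPORTED;
  }
}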
20 changes: 18 additions & 2 deletions core/util.cpp
@@ -22,9 +22,10 @@
*************************************************************************/

#include <string>
#include <stdexcept>
#include "mlu_op.h"

#include "core/logging.h"
#include "core/util.h"
#include "mlu_op.h"

void mluOpCheck(mluOpStatus_t result, char const *const func,
const char *const file, int const line) {
@@ -38,3 +39,18 @@ void mluOpCheck(mluOpStatus_t result, char const *const func,
throw std::runtime_error(error);
}
}

bool isStrideTensor(const int dim, const int64_t *dims,
const int64_t *strides) {
int64_t stride_base = 1;

for (int i = dim - 1; i >= 0; i--) {
if (dims[i] != 1 && strides[i] != stride_base) {
return true;
}

stride_base *= dims[i];
}

return false;
}
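A quick way to sanity-check the predicate above is a standalone program; the copy below, its main, and the test shapes are illustrative, not part of the commit. The key property: stride_base accumulates the dense row-major stride from the innermost dimension outward, and extent-1 dimensions are ignored because their stride never affects addressing.

#include <cassert>
#include <cstdint>

// Same logic as isStrideTensor above: a tensor is non-contiguous if any
// dimension with extent > 1 has a stride differing from the dense
// row-major stride accumulated right-to-left.
static bool isStrideTensorSketch(const int dim, const int64_t *dims,
                                 const int64_t *strides) {
  int64_t stride_base = 1;
  for (int i = dim - 1; i >= 0; i--) {
    if (dims[i] != 1 && strides[i] != stride_base) {
      return true;
    }
    stride_base *= dims[i];
  }
  return false;
}

int main() {
  const int64_t dims[2] = {2, 3};
  const int64_t dense[2] = {3, 1};       // dense 2x3 row-major layout
  assert(!isStrideTensorSketch(2, dims, dense));

  const int64_t transposed[2] = {1, 3};  // transposed view of a 3x2 buffer
  assert(isStrideTensorSketch(2, dims, transposed));

  const int64_t dims_one[2] = {1, 3};    // extent-1 dims are skipped, so an
  const int64_t odd[2] = {99, 1};        // arbitrary stride there still passes
  assert(!isStrideTensorSketch(2, dims_one, odd));
  return 0;
}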
34 changes: 34 additions & 0 deletions core/util.h
@@ -0,0 +1,34 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/

#ifndef CORE_UTIL_H_
#define CORE_UTIL_H_

#include "mlu_op.h"

void mluOpCheck(mluOpStatus_t result, char const *const func,
const char *const file, int const line);

bool isStrideTensor(const int dim, const int64_t *dims, const int64_t *strides);

#endif // CORE_UTIL_H_
8 changes: 8 additions & 0 deletions kernels/active_rotated_filter/active_rotated_filter.cpp
@@ -85,6 +85,14 @@ static mluOpStatus_t activeRotatedFilterForwardParamCheck(
PARAM_CHECK(api_name, (output_desc->dims[1] ==
input_desc->dims[1] * input_desc->dims[2]));

// check stride
STRIDE_TENSOR_CHECK(api_name + ":", input_desc,
"input_desc must be contiguous");
STRIDE_TENSOR_CHECK(api_name + ":", indices_desc,
"indices_desc must be contiguous");
STRIDE_TENSOR_CHECK(api_name + ":", output_desc,
"output_desc must be contiguous");

// check tensor datatype, support float16 and float32
PARAM_CHECK_V2(api_name,
(input_desc->dtype == MLUOP_DTYPE_HALF) ||
33 changes: 24 additions & 9 deletions kernels/adam_w/adam_w.cpp
@@ -182,6 +182,22 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc,
PARAM_CHECK("[mluOpAdamW]", velocity != nullptr);
PARAM_CHECK("[mluOpAdamW]", grad != nullptr);

// stride check
if (param_desc != nullptr) {
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", param_desc,
"param_desc must be contiguous");
}
if (paramh_desc != nullptr) {
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", paramh_desc,
"paramh_desc must be contiguous");
}
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", momentum_desc,
"momentum_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", velocity_desc,
"velocity_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", grad_desc,
"grad_desc must be contiguous");

// generate adam prototxt start!
if (MLUOP_GEN_CASE_ON_NEW) {
GEN_CASE_START("adamw", "ADAMW");
@@ -246,17 +262,16 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc,
return MLUOP_STATUS_ARCH_MISMATCH;
}
case CNRT_FUNC_TYPE_UNION1: {
VLOG(5) << "Launch Kernel KernelApplyAdamW<<<Union"
<< k_type / CORE_DIM << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
VLOG(5) << "Launch Kernel KernelApplyAdamW<<<Union" << k_type / CORE_DIM
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
CHECK_RETURN(
"[mluOpAdamW]",
KernelApplyAdamW(k_dim, k_type, handle->queue, (void *)param,
(void *)param_h, (void *)grad, (void *)momentum,
(void *)velocity, lr, beta1, beta2, bias1, bias2,
epsilon, adamw_desc->weight_decay,
adamw_desc->grad_scale, adamw_desc->use_nesterov,
size, k_data_type));
}
}
GEN_CASE_END();
8 changes: 8 additions & 0 deletions kernels/ball_query/ball_query.cpp
@@ -104,6 +104,14 @@ mluOpStatus_t MLUOP_WIN_API mluOpBallQuery(
PARAM_CHECK("[mluOpBallQuery]", xyz_desc->dims[2] == 3);
PARAM_CHECK("[mluOpBallQuery]", idx_desc->dims[2] == nsample);

// check stride
STRIDE_TENSOR_CHECK("[mluOpBallQuery]:", new_xyz_desc,
"new_xyz_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBallQuery]:", xyz_desc,
"xyz_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBallQuery]:", idx_desc,
"idx_desc must be contiguous");

// check dtype
if (!isSupportType(new_xyz_desc->dtype, support_type, 2)) {
LOG(ERROR) << "[mluOpBallQuery]:Only half and float are supported in input "
5 changes: 5 additions & 0 deletions kernels/bbox_overlaps/bbox_overlaps.cpp
@@ -70,6 +70,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpBboxOverlaps(
PARAM_CHECK(API, bbox1_desc->dim == 2);
PARAM_CHECK(API, bbox2_desc->dim == 2);

// stride check
STRIDE_TENSOR_CHECK(API + ":", bbox1_desc, "bbox1_desc must be contiguous");
STRIDE_TENSOR_CHECK(API + ":", bbox2_desc, "bbox2_desc must be contiguous");
STRIDE_TENSOR_CHECK(API + ":", ious_desc, "ious_desc must be contiguous");

// param check
if (mode != 1 && mode != 0) {
LOG(ERROR) << "[mluOpBboxOverlaps] Check failed: The mode must be 0 or 1, "
2 changes: 1 addition & 1 deletion kernels/binary_op/binary_op_3pipeline.h
@@ -160,7 +160,7 @@
rem * sizeof(DType_out), NRAM2GDRAM); \
pvUnlock(); \
} \
}

// Divide tasks in host
#define BINARY_OP_KERNEL_3PIPELINE_V2_DECLARE(Op, Prefer) \
8 changes: 4 additions & 4 deletions kernels/binary_op/binary_op_5pipeline.h
@@ -42,17 +42,17 @@

/****************************************************************************
* GDRAM2SRAM: io pipeline
* SRAM2NRAM : mv pipeline
* In Cambricon hardware, the io, compute, and move pipelines have their own
* instruction queues and can be launched in parallel,
* so the time of io, compute and move can cover each other.
* The five pipelines are:
* GDRAM2SRAM : io loads data from gdram
* SRAM2NRAM  : mv loads data from sram
* Compute    : compute pipeline
* NRAM2SRAM  : mv stores data to sram
* SRAM2GDRAM : io stores data to gdram
*/

#if __BANG_ARCH__ != 520 // TODO(sram): tp_520
#define BINARY_OP_KERNEL_5PIPELINE(Op, Prefer) \
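The five-stage comment above describes classic software pipelining. Purely as an illustration (generic C++ with stub stages, not BANG code), the loop shape that lets each stage of iteration t overlap the next stage of iteration t-1 looks like this:

#include <cstdio>

// Illustrative only: the generic shape of a five-stage pipelined loop.
// On real hardware each stage is asynchronous, so within one step all
// five calls could execute in parallel between synchronization points.
void pipelined_loop(int iters) {
  auto load_sram   = [](int t) { std::printf("GDRAM2SRAM %d\n", t); };
  auto load_nram   = [](int t) { std::printf("SRAM2NRAM  %d\n", t); };
  auto compute     = [](int t) { std::printf("Compute    %d\n", t); };
  auto store_sram  = [](int t) { std::printf("NRAM2SRAM  %d\n", t); };
  auto store_gdram = [](int t) { std::printf("SRAM2GDRAM %d\n", t); };

  for (int t = 0; t < iters + 4; ++t) {
    if (t >= 4)                  store_gdram(t - 4);
    if (t >= 3 && t < iters + 3) store_sram(t - 3);
    if (t >= 2 && t < iters + 2) compute(t - 2);
    if (t >= 1 && t < iters + 1) load_nram(t - 1);
    if (t < iters)               load_sram(t);
    // a __sync() would go here; different iterations' stages overlap
  }
}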
2 changes: 1 addition & 1 deletion kernels/binary_op/binary_op_host.cpp
@@ -218,7 +218,7 @@ mluOpStatus_t binaryOpParamCheck(
"output tensor num is too large. ");

if (mluop::strideCaseWithNotConsistentDense(3, input1_desc, input2_desc,
output_desc)) {
uint64_t num_input1_with_stride = shapeStrideCount(input1_desc);
uint64_t num_input2_with_stride = shapeStrideCount(input2_desc);
uint64_t num_output_with_stride = shapeStrideCount(output_desc);
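Note the contrast with the stride checks added elsewhere in this commit: the binary-op path keeps accepting strided tensors and instead measures their footprint with shapeStrideCount before the large-tensor check. That helper's implementation is not shown in this diff; a plausible footprint count for a strided layout (hypothetical sketch, assuming non-negative strides) is:

#include <cstdint>

// Hypothetical sketch of a strided-layout footprint count: the furthest
// element sits at sum((dims[i] - 1) * strides[i]), so the layout spans
// that offset + 1 elements; a zero-extent dim makes the tensor empty.
uint64_t strideFootprint(int dim, const int64_t *dims, const int64_t *strides) {
  uint64_t last_offset = 0;
  for (int i = 0; i < dim; ++i) {
    if (dims[i] == 0) return 0;
    last_offset += static_cast<uint64_t>((dims[i] - 1) * strides[i]);
  }
  return last_offset + 1;
}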
kernels/border_align/border_align_backward/border_align_backward.cpp
@@ -61,6 +61,16 @@ mluOpStatus_t mluOpBorderAlignBackward(
PARAM_CHECK(API, argmax_idx_desc->dim == 4);
PARAM_CHECK(API, grad_input_desc->dim == 4);

// stride check
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", grad_output_desc,
"grad_output_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", boxes_desc,
"boxes_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", argmax_idx_desc,
"argmax_idx_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", grad_input_desc,
"grad_input_desc must be contiguous");

const int32_t border_num = 4;
const int32_t coord_num = 4;
const int32_t origin_n = grad_input_desc->dims[0];
@@ -83,14 +93,13 @@ mluOpStatus_t mluOpBorderAlignBackward(
"(4 represents the number of borders).");
PARAM_CHECK_NE(API, grad_input_desc->dims[0], 0);
PARAM_CHECK_NE(API, grad_input_desc->dims[3] / 4, 0,
"(4 represents the number of borders).");
"(4 represents the number of borders).");
PARAM_CHECK_NE(API, grad_input_desc->dims[1], 0);
PARAM_CHECK_NE(API, grad_input_desc->dims[2], 0);
PARAM_CHECK(API, grad_input_desc->dims[1] * grad_input_desc->dims[2] ==
boxes_desc->dims[1]);
PARAM_CHECK(API, boxes_desc->dim == 3);
PARAM_CHECK(API, boxes_desc->dims[2] == border_num, "(border_num = 4).");
PARAM_CHECK_NE(API, boxes_desc->dims[1], 0);
PARAM_CHECK_GT(API, pool_size, 0);

@@ -103,8 +112,7 @@

PARAM_CHECK_EQ(API, boxes_desc->dims[0], grad_input_desc->dims[0]);
PARAM_CHECK_EQ(API, boxes_desc->dims[1], boxes_desc->dims[1]);
PARAM_CHECK_EQ(API, boxes_desc->dims[2], border_num, "(border_num = 4).");

PARAM_CHECK_EQ(API, argmax_idx_desc->dims[0], grad_input_desc->dims[0]);
PARAM_CHECK_EQ(API, argmax_idx_desc->dims[1], boxes_desc->dims[1]);
@@ -133,8 +141,7 @@ mluOpStatus_t mluOpBorderAlignBackward(
GEN_CASE_DATA_REAL(true, "input2", boxes, boxes_desc);
GEN_CASE_DATA_REAL(true, "input3", argmax_idx, argmax_idx_desc);
GEN_CASE_DATA(false, "output1", grad_input, grad_input_desc, 0, 0);
GEN_CASE_OP_PARAM_SINGLE(0, "border_align", "pool_size",
pool_size);
GEN_CASE_OP_PARAM_SINGLE(0, "border_align", "pool_size", pool_size);
GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0);
}

16 changes: 13 additions & 3 deletions kernels/border_align/border_align_forward/border_align_forward.cpp
@@ -58,6 +58,16 @@ mluOpStatus_t mluOpBorderAlignForward(
PARAM_CHECK(API, output_desc->dim == 4);
PARAM_CHECK(API, argmax_idx_desc->dim == 4);

// stride check
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", input_desc,
"input_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", boxes_desc,
"boxes_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", output_desc,
"output_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", argmax_idx_desc,
"argmax_idx_desc must be contiguous");

const int32_t border_num = 4;
const int32_t coord_num = 4;
const int32_t origin_n = input_desc->dims[0];
@@ -87,9 +97,9 @@ mluOpStatus_t mluOpBorderAlignForward(
PARAM_CHECK(API, boxes_desc->dim == 3);
PARAM_CHECK(API, boxes_desc->dims[2] == 4);

PARAM_CHECK(API, input_desc->dims[0] == boxes_desc->dims[0]);
PARAM_CHECK(API,
input_desc->dims[1] * input_desc->dims[2] == boxes_desc->dims[1]);
PARAM_CHECK_EQ(API, output_desc->dims[0], input_desc->dims[0]);
PARAM_CHECK_EQ(API, output_desc->dims[1], boxes_desc->dims[1]);
PARAM_CHECK_EQ(API, output_desc->dims[2], 4);
9 changes: 9 additions & 0 deletions kernels/box_iou_rotated/box_iou_rotated.cpp
@@ -172,6 +172,15 @@ mluOpBoxIouRotated(mluOpHandle_t handle, const int mode, const bool aligned,
return MLUOP_STATUS_BAD_PARAM;
}
}

// stride check
STRIDE_TENSOR_CHECK("[mluOpBoxIouRotated]:", box1_desc,
"box1_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBoxIouRotated]:", box2_desc,
"box2_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBoxIouRotated]:", ious_desc,
"ious_desc must be contiguous");

// 0-element check, after dim and shape check
if (box1_desc->dims[0] * box2_desc->dims[0] == 0) {
VLOG(5) << "[mluOpBoxIouRotated] Skip zero element boxes.";
6 changes: 4 additions & 2 deletions kernels/carafe/carafe_block.mlu
@@ -368,7 +368,8 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
(T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
((T *)nram_buf + NRAM_BLOCK / sizeof(T))[mask_index], num_align);
__bang_atomic_reduce_add((T *)base_grad_input,
(T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
num_align);
__bang_mul((T *)nram_buf, (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
(T *)nram_buf, num_align);

@@ -411,7 +412,8 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
((T *)nram_buf + NRAM_BLOCK / sizeof(T))[mask_index],
rem_for_loop_align);
__bang_atomic_reduce_add((T *)base_grad_input,
(T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
rem_for_loop);
__bang_mul((T *)nram_buf, (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
(T *)nram_buf, rem_for_loop_align);
