[Feature](mlu-ops): add stride tensor check (#1066)
Co-authored-by: fuzhouxiang <[email protected]>
Zhouxiang and fuzhouxiang authored Aug 5, 2024
1 parent 2cd5bc6 commit 240c7c3
Showing 73 changed files with 946 additions and 366 deletions.
26 changes: 19 additions & 7 deletions core/logging.h
@@ -28,8 +28,10 @@
#include <string>
#include <limits>
#include <sstream>
#include "core/macros.h"

#include "core/cnlog.hpp"
#include "core/macros.h"
#include "core/util.h"
#include "mlu_op.h"

#define LARGE_TENSOR_NUM ((uint64_t)2147483648)
@@ -216,9 +218,17 @@ extern bool mluop_check_large_tensor_dim_size_;
return MLUOP_STATUS_NOT_SUPPORTED; \
}

#define STRIDE_TENSOR_CHECK(api, desc, reason) \
if (MLUOP_PREDICT_TRUE(desc != NULL)) { \
if (MLUOP_PREDICT_FALSE( \
MLUOP_PREDICT_TRUE(0 != mluOpGetTensorElementNum(desc)) && \
isStrideTensor(desc->dim, desc->dims, desc->strides))) { \
LOG(ERROR) << api << " stride tensor is not supported. " << reason; \
return MLUOP_STATUS_NOT_SUPPORTED; \
} \
}

#define MLUOP_CHECK(val) mluOpCheck((val), #val, __FILE__, __LINE__)

#define KERNEL_CALL_CHECK(parent_kernel, sub_kernel, status, statement) \
do { \
@@ -274,9 +284,11 @@ struct Voidifier {
return vmodule_activated; \
})(lvl, __FILE__))

#define VLOG(level) \
MLUOP_PREDICT_TRUE(!VLOG_IS_ON(({ \
static_assert(level > 0, "VLOG level should be greater than 0"); \
level; \
}))) \
? (void)0 : ::mluop::internal::Voidifier() & LOG(VLOG)

// This formats a value for a failing CHECK_XX statement. Ordinarily,
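For reference, with the branch-prediction macros stripped, STRIDE_TENSOR_CHECK(api, desc, reason) behaves like the following guard (a sketch derived from the macro above; the surrounding parameter-check function is implied, not shown):

// Expanded sketch of STRIDE_TENSOR_CHECK: NULL descriptors and
// zero-element tensors are skipped, so only a genuinely strided
// (non-contiguous) layout triggers the early return.
if (desc != NULL) {
  if (0 != mluOpGetTensorElementNum(desc) &&
      isStrideTensor(desc->dim, desc->dims, desc->strides)) {
    LOG(ERROR) << api << " stride tensor is not supported. " << reason;
    return MLUOP_STATUS_NOT_SUPPORTED;
  }
}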
20 changes: 18 additions & 2 deletions core/util.cpp
@@ -22,9 +22,10 @@
*************************************************************************/

#include <string>
#include <stdexcept>
#include "mlu_op.h"

#include "core/logging.h"
#include "core/util.h"
#include "mlu_op.h"

void mluOpCheck(mluOpStatus_t result, char const *const func,
const char *const file, int const line) {
@@ -38,3 +39,18 @@ void mluOpCheck(mluOpStatus_t result, char const *const func,
throw std::runtime_error(error);
}
}

bool isStrideTensor(const int dim, const int64_t *dims,
const int64_t *strides) {
int64_t stride_base = 1;

for (int i = dim - 1; i >= 0; i--) {
if (dims[i] != 1 && strides[i] != stride_base) {
return true;
}

stride_base *= dims[i];
}

return false;
}
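A quick way to sanity-check the predicate above is a standalone program; the copy below, its main, and the test shapes are illustrative, not part of the commit. The key property: stride_base accumulates the dense row-major stride from the innermost dimension outward, and extent-1 dimensions are ignored because their stride never affects addressing.

#include <cassert>
#include <cstdint>

// Same logic as isStrideTensor above: a tensor is non-contiguous if any
// dimension with extent > 1 has a stride differing from the dense
// row-major stride accumulated right-to-left.
static bool isStrideTensorSketch(const int dim, const int64_t *dims,
                                 const int64_t *strides) {
  int64_t stride_base = 1;
  for (int i = dim - 1; i >= 0; i--) {
    if (dims[i] != 1 && strides[i] != stride_base) {
      return true;
    }
    stride_base *= dims[i];
  }
  return false;
}

int main() {
  const int64_t dims[2] = {2, 3};
  const int64_t dense[2] = {3, 1};       // dense 2x3 row-major layout
  assert(!isStrideTensorSketch(2, dims, dense));

  const int64_t transposed[2] = {1, 3};  // transposed view of a 3x2 buffer
  assert(isStrideTensorSketch(2, dims, transposed));

  const int64_t dims_one[2] = {1, 3};    // extent-1 dims are skipped, so an
  const int64_t odd[2] = {99, 1};        // arbitrary stride there still passes
  assert(!isStrideTensorSketch(2, dims_one, odd));
  return 0;
}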
34 changes: 34 additions & 0 deletions core/util.h
@@ -0,0 +1,34 @@
/*************************************************************************
* Copyright (C) [2022] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/

#ifndef CORE_UTIL_H_
#define CORE_UTIL_H_

#include "mlu_op.h"

void mluOpCheck(mluOpStatus_t result, char const *const func,
const char *const file, int const line);

bool isStrideTensor(const int dim, const int64_t *dims, const int64_t *strides);

#endif // CORE_UTIL_H_
8 changes: 8 additions & 0 deletions kernels/active_rotated_filter/active_rotated_filter.cpp
@@ -85,6 +85,14 @@ static mluOpStatus_t activeRotatedFilterForwardParamCheck(
PARAM_CHECK(api_name, (output_desc->dims[1] ==
input_desc->dims[1] * input_desc->dims[2]));

// check stride
STRIDE_TENSOR_CHECK(api_name + ":", input_desc,
"input_desc must be contiguous");
STRIDE_TENSOR_CHECK(api_name + ":", indices_desc,
"indices_desc must be contiguous");
STRIDE_TENSOR_CHECK(api_name + ":", output_desc,
"output_desc must be contiguous");

// check tensor datatype, support float16 and float32
PARAM_CHECK_V2(api_name,
(input_desc->dtype == MLUOP_DTYPE_HALF) ||
33 changes: 24 additions & 9 deletions kernels/adam_w/adam_w.cpp
@@ -182,6 +182,22 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc,
PARAM_CHECK("[mluOpAdamW]", velocity != nullptr);
PARAM_CHECK("[mluOpAdamW]", grad != nullptr);

// stride check
if (param_desc != nullptr) {
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", param_desc,
"param_desc must be contiguous");
}
if (paramh_desc != nullptr) {
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", paramh_desc,
"paramh_desc must be contiguous");
}
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", momentum_desc,
"momentum_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", velocity_desc,
"velocity_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpAdamW]:", grad_desc,
"grad_desc must be contiguous");

// generate adam prototxt start!
if (MLUOP_GEN_CASE_ON_NEW) {
GEN_CASE_START("adamw", "ADAMW");
@@ -246,17 +262,16 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc,
return MLUOP_STATUS_ARCH_MISMATCH;
}
case CNRT_FUNC_TYPE_UNION1: {
VLOG(5) << "Launch Kernel KernelApplyAdamW<<<Union"
<< k_type / CORE_DIM << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
VLOG(5) << "Launch Kernel KernelApplyAdamW<<<Union" << k_type / CORE_DIM
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
CHECK_RETURN(
"[mluOpAdamW]",
KernelApplyAdamW(k_dim, k_type, handle->queue, (void *)param,
(void *)param_h, (void *)grad, (void *)momentum,
(void *)velocity, lr, beta1, beta2, bias1, bias2,
epsilon, adamw_desc->weight_decay,
adamw_desc->grad_scale, adamw_desc->use_nesterov,
size, k_data_type));
}
}
GEN_CASE_END();
8 changes: 8 additions & 0 deletions kernels/ball_query/ball_query.cpp
@@ -104,6 +104,14 @@ mluOpStatus_t MLUOP_WIN_API mluOpBallQuery(
PARAM_CHECK("[mluOpBallQuery]", xyz_desc->dims[2] == 3);
PARAM_CHECK("[mluOpBallQuery]", idx_desc->dims[2] == nsample);

// check stride
STRIDE_TENSOR_CHECK("[mluOpBallQuery]:", new_xyz_desc,
"new_xyz_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBallQuery]:", xyz_desc,
"xyz_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBallQuery]:", idx_desc,
"idx_desc must be contiguous");

// check dtype
if (!isSupportType(new_xyz_desc->dtype, support_type, 2)) {
LOG(ERROR) << "[mluOpBallQuery]:Only half and float are supported in input "
5 changes: 5 additions & 0 deletions kernels/bbox_overlaps/bbox_overlaps.cpp
@@ -70,6 +70,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpBboxOverlaps(
PARAM_CHECK(API, bbox1_desc->dim == 2);
PARAM_CHECK(API, bbox2_desc->dim == 2);

// stride check
STRIDE_TENSOR_CHECK(API + ":", bbox1_desc, "bbox1_desc must be contiguous");
STRIDE_TENSOR_CHECK(API + ":", bbox2_desc, "bbox2_desc must be contiguous");
STRIDE_TENSOR_CHECK(API + ":", ious_desc, "ious_desc must be contiguous");

// param check
if (mode != 1 && mode != 0) {
LOG(ERROR) << "[mluOpBboxOverlaps] Check failed: The mode must be 0 or 1, "
2 changes: 1 addition & 1 deletion kernels/binary_op/binary_op_3pipeline.h
@@ -160,7 +160,7 @@
rem * sizeof(DType_out), NRAM2GDRAM); \
pvUnlock(); \
} \
}

// Divide tasks in host
#define BINARY_OP_KERNEL_3PIPELINE_V2_DECLARE(Op, Prefer) \
8 changes: 4 additions & 4 deletions kernels/binary_op/binary_op_5pipeline.h
@@ -42,17 +42,17 @@

/****************************************************************************
* GDRAM2SRAM: io pipeline
* SRAM2NRAM : mv pipeline
* In Cambricon hardware, the io, compute, and move pipelines have their own
* instruction queues and can be launched in parallel,
* so the time of io, compute and move can cover each other.
* The five pipelines are:
* GDRAM2SRAM : io loads data from gdram
* SRAM2NRAM  : mv loads data from sram
* Compute    : compute pipeline
* NRAM2SRAM  : mv stores data to sram
* SRAM2GDRAM : io stores data to gdram
*/

#if __BANG_ARCH__ != 520 // TODO(sram): tp_520
#define BINARY_OP_KERNEL_5PIPELINE(Op, Prefer) \
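The five-stage comment above describes classic software pipelining. Purely as an illustration (generic C++ with stub stages, not BANG code), the loop shape that lets each stage of iteration t overlap the next stage of iteration t-1 looks like this:

#include <cstdio>

// Illustrative only: the generic shape of a five-stage pipelined loop.
// On real hardware each stage is asynchronous, so within one step all
// five calls could execute in parallel between synchronization points.
void pipelined_loop(int iters) {
  auto load_sram   = [](int t) { std::printf("GDRAM2SRAM %d\n", t); };
  auto load_nram   = [](int t) { std::printf("SRAM2NRAM  %d\n", t); };
  auto compute     = [](int t) { std::printf("Compute    %d\n", t); };
  auto store_sram  = [](int t) { std::printf("NRAM2SRAM  %d\n", t); };
  auto store_gdram = [](int t) { std::printf("SRAM2GDRAM %d\n", t); };

  for (int t = 0; t < iters + 4; ++t) {
    if (t >= 4)                  store_gdram(t - 4);
    if (t >= 3 && t < iters + 3) store_sram(t - 3);
    if (t >= 2 && t < iters + 2) compute(t - 2);
    if (t >= 1 && t < iters + 1) load_nram(t - 1);
    if (t < iters)               load_sram(t);
    // a __sync() would go here; different iterations' stages overlap
  }
}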
2 changes: 1 addition & 1 deletion kernels/binary_op/binary_op_host.cpp
@@ -218,7 +218,7 @@ mluOpStatus_t binaryOpParamCheck(
"output tensor num is too large. ");

if (mluop::strideCaseWithNotConsistentDense(3, input1_desc, input2_desc,
output_desc)) {
uint64_t num_input1_with_stride = shapeStrideCount(input1_desc);
uint64_t num_input2_with_stride = shapeStrideCount(input2_desc);
uint64_t num_output_with_stride = shapeStrideCount(output_desc);
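Note the contrast with the stride checks added elsewhere in this commit: the binary-op path keeps accepting strided tensors and instead measures their footprint with shapeStrideCount before the large-tensor check. That helper's implementation is not shown in this diff; a plausible footprint count for a strided layout (hypothetical sketch, assuming non-negative strides) is:

#include <cstdint>

// Hypothetical sketch of a strided-layout footprint count: the furthest
// element sits at sum((dims[i] - 1) * strides[i]), so the layout spans
// that offset + 1 elements; a zero-extent dim makes the tensor empty.
uint64_t strideFootprint(int dim, const int64_t *dims, const int64_t *strides) {
  uint64_t last_offset = 0;
  for (int i = 0; i < dim; ++i) {
    if (dims[i] == 0) return 0;
    last_offset += static_cast<uint64_t>((dims[i] - 1) * strides[i]);
  }
  return last_offset + 1;
}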
kernels/border_align/border_align_backward/border_align_backward.cpp
@@ -61,6 +61,16 @@ mluOpStatus_t mluOpBorderAlignBackward(
PARAM_CHECK(API, argmax_idx_desc->dim == 4);
PARAM_CHECK(API, grad_input_desc->dim == 4);

// stride check
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", grad_output_desc,
"grad_output_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", boxes_desc,
"boxes_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", argmax_idx_desc,
"argmax_idx_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignBackward]:", grad_input_desc,
"grad_input_desc must be contiguous");

const int32_t border_num = 4;
const int32_t coord_num = 4;
const int32_t origin_n = grad_input_desc->dims[0];
@@ -83,14 +93,13 @@ mluOpStatus_t mluOpBorderAlignBackward(
"(4 represents the number of borders).");
PARAM_CHECK_NE(API, grad_input_desc->dims[0], 0);
PARAM_CHECK_NE(API, grad_input_desc->dims[3] / 4, 0,
"(4 represents the number of borders).");
"(4 represents the number of borders).");
PARAM_CHECK_NE(API, grad_input_desc->dims[1], 0);
PARAM_CHECK_NE(API, grad_input_desc->dims[2], 0);
PARAM_CHECK(API, grad_input_desc->dims[1] * grad_input_desc->dims[2] ==
boxes_desc->dims[1]);
PARAM_CHECK(API, boxes_desc->dim == 3);
PARAM_CHECK(API, boxes_desc->dims[2] == border_num, "(border_num = 4).");
PARAM_CHECK_NE(API, boxes_desc->dims[1], 0);
PARAM_CHECK_GT(API, pool_size, 0);

@@ -103,8 +112,7 @@

PARAM_CHECK_EQ(API, boxes_desc->dims[0], grad_input_desc->dims[0]);
PARAM_CHECK_EQ(API, boxes_desc->dims[1], boxes_desc->dims[1]);
PARAM_CHECK_EQ(API, boxes_desc->dims[2], border_num, "(border_num = 4).");

PARAM_CHECK_EQ(API, argmax_idx_desc->dims[0], grad_input_desc->dims[0]);
PARAM_CHECK_EQ(API, argmax_idx_desc->dims[1], boxes_desc->dims[1]);
@@ -133,8 +141,7 @@ mluOpStatus_t mluOpBorderAlignBackward(
GEN_CASE_DATA_REAL(true, "input2", boxes, boxes_desc);
GEN_CASE_DATA_REAL(true, "input3", argmax_idx, argmax_idx_desc);
GEN_CASE_DATA(false, "output1", grad_input, grad_input_desc, 0, 0);
GEN_CASE_OP_PARAM_SINGLE(0, "border_align", "pool_size",
pool_size);
GEN_CASE_OP_PARAM_SINGLE(0, "border_align", "pool_size", pool_size);
GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0);
}

16 changes: 13 additions & 3 deletions kernels/border_align/border_align_forward/border_align_forward.cpp
@@ -58,6 +58,16 @@ mluOpStatus_t mluOpBorderAlignForward(
PARAM_CHECK(API, output_desc->dim == 4);
PARAM_CHECK(API, argmax_idx_desc->dim == 4);

// stride check
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", input_desc,
"input_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", boxes_desc,
"boxes_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", output_desc,
"output_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBorderAlignForward]:", argmax_idx_desc,
"argmax_idx_desc must be contiguous");

const int32_t border_num = 4;
const int32_t coord_num = 4;
const int32_t origin_n = input_desc->dims[0];
@@ -87,9 +97,9 @@ mluOpStatus_t mluOpBorderAlignForward(
PARAM_CHECK(API, boxes_desc->dim == 3);
PARAM_CHECK(API, boxes_desc->dims[2] == 4);

PARAM_CHECK(API, input_desc->dims[0] == boxes_desc->dims[0]);
PARAM_CHECK(API,
input_desc->dims[1] * input_desc->dims[2] == boxes_desc->dims[1]);
PARAM_CHECK_EQ(API, output_desc->dims[0], input_desc->dims[0]);
PARAM_CHECK_EQ(API, output_desc->dims[1], boxes_desc->dims[1]);
PARAM_CHECK_EQ(API, output_desc->dims[2], 4);
9 changes: 9 additions & 0 deletions kernels/box_iou_rotated/box_iou_rotated.cpp
@@ -172,6 +172,15 @@ mluOpBoxIouRotated(mluOpHandle_t handle, const int mode, const bool aligned,
return MLUOP_STATUS_BAD_PARAM;
}
}

// stride check
STRIDE_TENSOR_CHECK("[mluOpBoxIouRotated]:", box1_desc,
"box1_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBoxIouRotated]:", box2_desc,
"box2_desc must be contiguous");
STRIDE_TENSOR_CHECK("[mluOpBoxIouRotated]:", ious_desc,
"ious_desc must be contiguous");

// 0-element check, after dim and shape check
if (box1_desc->dims[0] * box2_desc->dims[0] == 0) {
VLOG(5) << "[mluOpBoxIouRotated] Skip zero element boxes.";
6 changes: 4 additions & 2 deletions kernels/carafe/carafe_block.mlu
@@ -368,7 +368,8 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
(T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
((T *)nram_buf + NRAM_BLOCK / sizeof(T))[mask_index], num_align);
__bang_atomic_reduce_add((T *)base_grad_input,
(T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
num_align);
__bang_mul((T *)nram_buf, (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
(T *)nram_buf, num_align);

@@ -411,7 +412,8 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
((T *)nram_buf + NRAM_BLOCK / sizeof(T))[mask_index],
rem_for_loop_align);
__bang_atomic_reduce_add((T *)base_grad_input,
(T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
rem_for_loop);
__bang_mul((T *)nram_buf, (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
(T *)nram_buf, rem_for_loop_align);
