[Feature](bangc-ops): add concat operator.
zhengleiZL committed Oct 19, 2023
1 parent 88c2fa7 commit 02d02fd
Showing 10 changed files with 665 additions and 201 deletions.
41 changes: 41 additions & 0 deletions bangc-ops/kernels/concat/concat.cpp
@@ -0,0 +1,41 @@
/*************************************************************************
* Copyright (C) [2023] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "kernels/kernel_wrapper/wrapper.h"

mluOpStatus_t MLUOP_WIN_API mluOpConcat(
mluOpHandle_t handle,
const int concat_num,
const int axis,
const mluOpTensorDescriptor_t inputs_desc[],
const void *const inputs[],
void *workspace,
size_t workspace_size,
const mluOpTensorDescriptor_t output_desc,
void *output) {
ConcatWrapper wrapper;
mluOpStatus_t ret = wrapper.invoke(handle, concat_num, axis, inputs_desc,
inputs, workspace, workspace_size,
output_desc, output);
return ret;
}

Binary file modified bangc-ops/kernels/kernel_wrapper/lib/libextops.a
100644 → 100755
Binary file not shown.
358 changes: 158 additions & 200 deletions bangc-ops/kernels/kernel_wrapper/wrapper.h

Large diffs are not rendered by default.

150 changes: 150 additions & 0 deletions bangc-ops/mlu_op.h
@@ -15293,6 +15293,156 @@ mluOpStridedSlice(mluOpHandle_t handle,
const mluOpTensorDescriptor_t output_desc,
void *output);

// Group:Concat
/*!
* @brief Concatenates the list of input tensors \b inputs along the given dimension \b axis.
*
* @param[in] handle
* Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in
* ::mluOpConcat operation. For detailed information, see ::mluOpHandle_t.
* @param[in] concat_num
 * Number of tensors to be concatenated.
* @param[in] axis
* Dimension along which to be concatenated. The value must be in the range of [-rank, rank),
* where rank is the number of dimensions in the input tensors,
* and negative \b axis refers to ``axis + rank``.
* @param[in] inputs_desc
* The list of descriptors of input tensors. For detailed information,
* see ::mluOpTensorDescriptor_t.
* @param[in] inputs
 * A host pointer to a list of MLU pointers, each of which points to the MLU memory that stores
 * one of the input tensors.
* @param[in] workspace
* Pointer to the MLU memory that is used as an extra workspace for the concat operation.
* For more information about workspace, see "Cambricon BANG C OPS User Guide". Because ::mluOpConcat
* does not need extra workspace, the \b workspace can be set to NULL.
* @param[in] workspace_size
 * The size of the extra workspace, in bytes, needed by the concat operation. You can get the
 * size of the workspace with the ::mluOpGetConcatWorkspaceSize function. Because ::mluOpConcat
 * does not need extra workspace, \b workspace_size can be set to 0.
* @param[in] output_desc
* The descriptor of the output tensor. For detailed information,
* see ::mluOpTensorDescriptor_t.
* @param[out] output
* Pointer to the MLU memory that stores the output tensor.
*
* @par Return
* - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_ALLOC_FAILED
*
* @par Formula
* - See "Concat Operator" section in "Cambricon MLUOP User Guide" for details.
*
* @par Data Type
 * - This function supports the following byte-width data types for the \b input and \b output
 *   tensors. The byte width of a data type can be obtained with the ::mluOpGetSizeOfDataType
 *   function.
 * <b>Note that all the tensors must have the same data type. If the tensors have a fixed-point
 * data type, the quantization parameters of all the tensors should be the same.</b>
* - The supported byte-width data types are as follows:
* - input tensor: 1-byte, 2-byte, 4-byte, 8-byte.
* - output tensor: 1-byte, 2-byte, 4-byte, 8-byte.
*
* @par Data Layout
* - None.
*
* @par Scale Limitation
* - The parameters must meet the following requirements:
* - The parameter \b concat_num should be greater than 0.
 * - All tensors, including the inputs and the output, must have the same number of dimensions.
 * - For every dimension other than \b axis, all tensors must have the same size, and the size
 *   of the output on \b axis must equal the sum of the sizes of the inputs on \b axis.
*
* @par API Dependency
 * - Before calling this function, you need to call ::mluOpGetConcatWorkspaceSize to get the size
 *   of the extra workspace needed by the concat operation.
*
* @par Note
* - None.
*
* @par Requirements
* - None.
*
* @par Example
* - The example of concat operation is as follows:
* @verbatim
input: 3 tensors with the shapes of 2 * 3, 2 * 3 and 1 * 3, respectively
--> [[1,2,3],[4,5,6]]
--> [[7,8,9],[10,11,12]]
--> [[13,14,15]]

concat_num: 3

axis: 0

Then we will get the output:

output: a tensor of 5 * 3 --> [[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15]]
@endverbatim
*
* @par Reference
* - http://www.tensorflow.org/api_docs/python/tf/concat
*/
mluOpStatus_t MLUOP_WIN_API
mluOpConcat(mluOpHandle_t handle,
const int concat_num,
const int axis,
const mluOpTensorDescriptor_t inputs_desc[],
const void *const inputs[],
void *workspace,
size_t workspace_size,
const mluOpTensorDescriptor_t output_desc,
void *output);
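
The following is a hedged, host-side sketch of how the documented example could be driven through this API; it is not part of the commit. It assumes an already initialized mluOpHandle_t and float device buffers d_in0 (2 x 3), d_in1 (2 x 3), d_in2 (1 x 3), and d_out (5 x 3) that the caller has allocated and filled (for instance with the CNRT runtime); error checking is omitted for brevity.

#include "mlu_op.h"

mluOpStatus_t ConcatExample(mluOpHandle_t handle, void *d_in0, void *d_in1,
                            void *d_in2, void *d_out) {
  const int concat_num = 3;
  const int axis = 0;
  const int dims0[2] = {2, 3}, dims1[2] = {2, 3}, dims2[2] = {1, 3};
  const int dims_out[2] = {5, 3};

  // Describe the three inputs and the output as 2-D float tensors.
  mluOpTensorDescriptor_t inputs_desc[3], output_desc;
  for (int i = 0; i < 3; ++i) {
    mluOpCreateTensorDescriptor(&inputs_desc[i]);
  }
  mluOpCreateTensorDescriptor(&output_desc);
  mluOpSetTensorDescriptor(inputs_desc[0], MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, dims0);
  mluOpSetTensorDescriptor(inputs_desc[1], MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, dims1);
  mluOpSetTensorDescriptor(inputs_desc[2], MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, dims2);
  mluOpSetTensorDescriptor(output_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, 2, dims_out);

  // Query the workspace size (currently always 0 for concat).
  size_t workspace_size = 0;
  mluOpGetConcatWorkspaceSize(handle, concat_num, &workspace_size);

  // Concatenate along axis 0; with the example inputs above, the output is
  // [[1,2,3],[4,5,6],[7,8,9],[10,11,12],[13,14,15]].
  const void *const inputs[3] = {d_in0, d_in1, d_in2};
  mluOpStatus_t status = mluOpConcat(handle, concat_num, axis, inputs_desc, inputs,
                                     /*workspace=*/NULL, workspace_size,
                                     output_desc, d_out);

  for (int i = 0; i < 3; ++i) {
    mluOpDestroyTensorDescriptor(inputs_desc[i]);
  }
  mluOpDestroyTensorDescriptor(output_desc);
  return status;
}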

// Group:Concat
/*!
* @brief Returns in \b size the size of the MLU memory that is used as an extra workspace to
* optimize the concat operation.
*
* @param[in] handle
* Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in
* ::mluOpConcat operation. For detailed information, see ::mluOpHandle_t.
* @param[in] concat_num
 * Number of tensors to be concatenated.
* @param[out] size
 * A host pointer to the returned size, in bytes, of the extra workspace used in the
 * ::mluOpConcat operation. At present, ::mluOpConcat does not need extra workspace, so
 * \b size is returned as 0.
*
* @par Return
* - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM
*
* @par Formula
* - None.
*
* @par Data Type
* - None.
*
* @par Data Layout
* - None.
*
* @par Scale Limitation
* - The parameters must meet the following requirements:
* - The parameter \b concat_num should be greater than 0.
*
* @par API Dependency
 * - The workspace of the size returned by this function should be allocated and passed to the
 *   ::mluOpConcat function to perform the concat operation.
*
 * @par Note
* - None.
*
* @par Requirements
* - None.
*
* @par Example
* - None.
*
* @par Reference
* - None.
*/
mluOpStatus_t MLUOP_WIN_API
mluOpGetConcatWorkspaceSize(mluOpHandle_t handle, const int concat_num, size_t *size);
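
For completeness, a short sketch (also not part of this commit) of the usual query-then-allocate pattern that ties the two functions together. It assumes handle, concat_num, axis, inputs_desc, inputs, output_desc, and d_out are set up as in the sketch above, and that cnrtMalloc/cnrtFree from the CNRT runtime are available for device allocation; because the returned size is currently 0, the allocation branch is skipped in practice.

// Hypothetical workspace pattern; cnrtMalloc/cnrtFree are assumed to be available.
size_t workspace_size = 0;
mluOpGetConcatWorkspaceSize(handle, concat_num, &workspace_size);

void *workspace = NULL;
if (workspace_size > 0) {
  cnrtMalloc(&workspace, workspace_size);  // allocate device workspace if ever needed
}
mluOpConcat(handle, concat_num, axis, inputs_desc, inputs,
            workspace, workspace_size, output_desc, d_out);
if (workspace != NULL) {
  cnrtFree(workspace);
}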

#if defined(__cplusplus)
}
#endif
2 changes: 1 addition & 1 deletion bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto
120 changes: 120 additions & 0 deletions bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/concat/concat.cpp
@@ -0,0 +1,120 @@
/*************************************************************************
* Copyright (C) [2023] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "concat.h"

#include <memory>

namespace mluoptest {

void ConcatExecutor::paramCheck() {
assert(parser_->getInputNum() > 0);
assert(parser_->getOutputNum() == 1);
if (!parser_->getProtoNode()->has_concat_param()) {
LOG(ERROR) << "Lose concat param. ";
}
}

void ConcatExecutor::workspaceMalloc() {
input_num_ = parser_->getInputNum();
MLUOP_CHECK(
mluOpGetConcatWorkspaceSize(handle_, input_num_, &workspace_size_));
VLOG(4) << "Malloc workspace space.";
void *temp = mlu_runtime_.allocate(workspace_size_);
workspace_.push_back(temp);
VLOG(4) << "Malloc addr: " << temp << " , size: " << workspace_size_;

eva_->setMluWorkspaceSize(workspace_size_);
}

void ConcatExecutor::compute() {
VLOG(4) << "ConcatExecutor compute ";
if (!parser_->getProtoNode()->has_concat_param()) {
LOG(ERROR) << "Lose concat param. ";
}
axis_ = parser_->getProtoNode()->concat_param().axis();

std::vector<void *> pdev_input_h(input_num_);
for (int i = 0; i < input_num_; i++) {
pdev_input_h[i] = data_vector_[i].device_ptr;
}

mluOpTensorDescriptor_t *in_desc =
cpu_runtime_.allocate(new mluOpTensorDescriptor_t[input_num_]);
for (int i = 0; i < input_num_; i++) {
in_desc[i] = tensor_desc_[i].tensor;
}
auto out_desc = tensor_desc_[input_num_].tensor;

VLOG(4) << "call mluOpconcatTensor()";
interface_timer_.start();
MLUOP_CHECK(mluOpConcat(handle_, input_num_, axis_, in_desc,
pdev_input_h.data(), workspace_[0], workspace_size_,
out_desc, data_vector_[input_num_].device_ptr));
interface_timer_.stop();

if (in_desc) {
cpu_runtime_.deallocate(in_desc);
in_desc = nullptr;
}
}

void ConcatExecutor::workspaceFree() {
VLOG(4) << "Free device workspace space.";
if (workspace_[0] != nullptr) {
mlu_runtime_.deallocate(workspace_[0]);
}
}

void ConcatExecutor::cpuConcat(std::vector<TensorPair> input_desc,
                               std::vector<float *> input, int input_num,
                               int axis_t, float *output) {
  int dim_num = input_desc[0].tensor->dim;
  // Normalize a negative axis into [0, dim_num).
  size_t axis = axis_t < 0 ? axis_t + dim_num : axis_t;
  // high_size: number of "rows" before the concat axis; each row is filled
  // from every input in turn.
  size_t high_size = 1;
  for (size_t i = 0; i < axis; i++) {
    high_size *= input_desc[0].tensor->dims[i];
  }
  // low_low_size: number of elements per slice after the concat axis.
  size_t low_low_size = 1;
  for (size_t i = dim_num - 1; i > axis; i--) {
    low_low_size *= input_desc[0].tensor->dims[i];
  }
  // low_sizes[i]: contiguous elements contributed by input i per row.
  size_t *low_sizes = cpu_runtime_.allocate(new size_t[input_num]);
  for (size_t i = 0; i < input_num; i++) {
    low_sizes[i] = input_desc[i].tensor->dims[axis] * low_low_size;
  }

  // Interleave the inputs row by row into the output buffer.
  size_t offset = 0;
  for (size_t j = 0; j < high_size; j++) {
    for (size_t i = 0; i < input_num; i++) {
      memcpy(output + offset, input[i] + j * low_sizes[i],
             low_sizes[i] * sizeof(float));
      offset += low_sizes[i];
    }
  }
  cpu_runtime_.deallocate(low_sizes);
}

void ConcatExecutor::cpuCompute() {
assert(parser_->getInputNum() > 0);
assert(parser_->getOutputNum() == 1);

cpuConcat(tensor_desc_, cpu_fp32_input_, input_num_, axis_,
cpu_fp32_output_[0]);
}

int64_t ConcatExecutor::getTheoryOps() {
int64_t theory_ops = parser_->getOutputDataCount(0);
VLOG(4) << "getTheoryOps: " << theory_ops << " ops";
return theory_ops;
}

} // namespace mluoptest
40 changes: 40 additions & 0 deletions bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/concat/concat.h
@@ -0,0 +1,40 @@
/*************************************************************************
* Copyright (C) [2023] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_CONCAT_CONCAT_H_
#define TEST_MLU_OP_GTEST_SRC_ZOO_CONCAT_CONCAT_H_
#include <vector>
#include "executor.h"

namespace mluoptest {

class ConcatExecutor : public Executor {
public:
ConcatExecutor() {}
~ConcatExecutor() {}

void paramCheck();
void workspaceMalloc();
void compute();
void workspaceFree();
void cpuCompute();
void cpuConcat(std::vector<TensorPair> input_desc, std::vector<float *> input,
int input_num, int axis_t, float *output);
int64_t getTheoryOps() override;

private:
int axis_;
int input_num_;
size_t workspace_size_;
};

} // namespace mluoptest
#endif // TEST_MLU_OP_GTEST_SRC_ZOO_CONCAT_CONCAT_H_
