From 8478a9816ecd2553c37c45f1d55babf410feb9ca Mon Sep 17 00:00:00 2001
From: nth-BYTE <160582271+nth-BYTE@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:50:59 +0800
Subject: [PATCH] [Feature](mlu-ops): optimization for descriptor (#1083)

Co-authored-by: tonghengwen <tonghengwen@cambricon.com>
---
 core/logging.h                                |   2 +-
 core/tensor.cpp                               | 256 +++++++++---------
 core/tensor.h                                 | 214 ++++++---------
 core/type.cpp                                 |  32 ---
 core/type.h                                   |  35 ++-
 kernels/ball_query/ball_query.cpp             |   6 +-
 .../normal_get_indice_pairs.cpp               |  20 +-
 7 files changed, 265 insertions(+), 300 deletions(-)
diff --git a/core/logging.h b/core/logging.h
index f924fcedf..25bfd85e8 100644
--- a/core/logging.h
+++ b/core/logging.h
@@ -122,7 +122,7 @@
   }
 
 #define PARAM_CHECK(api, condition, ...)                                 \
-  if (!(condition)) {                                                    \
+  if MLUOP_PREDICT_FALSE (!(condition)) {                                \
     LOG(ERROR) << api << " Check failed: " #condition ". " #__VA_ARGS__; \
     return MLUOP_STATUS_BAD_PARAM;                                       \
   }
diff --git a/core/tensor.cpp b/core/tensor.cpp
index 5f0405146..cd9602fb2 100644
--- a/core/tensor.cpp
+++ b/core/tensor.cpp
@@ -22,7 +22,7 @@
  *************************************************************************/
 #include <iomanip>
 #include <algorithm>
-
+#include <deque>
 #include "core/tensor.h"
 #include "core/logging.h"
 #include "core/type.h"
@@ -294,41 +294,62 @@ mluOpDestroySeqDataDescriptor(mluOpSeqDataDescriptor_t seq_data_desc) {
   return MLUOP_STATUS_SUCCESS;
 }
 
-#if MLUOP_TENSOR_QUEUE_ENABLE
-static mluOpTensorDescriptorQueueStruct *queue_array = NULL;
-static std::hash<std::thread::id> hasher;
+namespace {
+
+#define MLUOP_TENSOR_QUEUE_ENABLE 1
 
-MLUOP_ATTRIBUTE_CONSTRUCTOR MLUOP_ATTRIBUTE_VISIBILITY_HIDDEN void mluOpInit() {
-  if (!queue_array) {
-    queue_array =
-        new (std::nothrow) mluOpTensorDescriptorQueueStruct[QUEUE_ARRAY_LENGTH];
+#if MLUOP_TENSOR_QUEUE_ENABLE
+struct mluOpTensorDescriptorQueueStruct {
+  mluOpTensorDescriptorQueueStruct() {
+    extend(extend_num);
+    extend_num *= 2;
   }
-}
+  explicit mluOpTensorDescriptorQueueStruct(size_t n) {
+    extend_num = n;
+    extend(extend_num);
+    extend_num *= 2;
+  }
+
+  // Let the OS do the cleanup since it's a global variable
+  ~mluOpTensorDescriptorQueueStruct() {}
 
-MLUOP_ATTRIBUTE_DESTRUCTOR MLUOP_ATTRIBUTE_VISIBILITY_HIDDEN void mluOpExit() {
-  if (queue_array) {
-    delete[] queue_array;
-    queue_array = NULL;
+  inline void lock() {
+    while (flag.test_and_set(std::memory_order_acquire)) {
+    }
   }
-}
+  inline void unlock() { flag.clear(std::memory_order_release); }
+  inline void extend(size_t n) {
+    mluOpTensorStruct *header = new (std::nothrow) mluOpTensorStruct[n];
+    for (size_t i = 0; i < n; ++i) {
+      mluOpTensorStruct *desc = header + i;
+      queue.push_front(desc);
+    }
+  }
+  size_t extend_num = 128;
+  std::deque<mluOpTensorDescriptor_t> queue;
+  std::atomic_flag flag = ATOMIC_FLAG_INIT;
+};
+
+static mluOpTensorDescriptorQueueStruct queue_array;
 #endif
+}  // anonymous namespace
+
 /* MLUOP interface */
 mluOpStatus_t MLUOP_WIN_API
 mluOpCreateTensorDescriptor(mluOpTensorDescriptor_t *desc) {
   PARAM_CHECK("[mluOpCreateTensorDescriptor]", desc != NULL);
 
 #if MLUOP_TENSOR_QUEUE_ENABLE
-  size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH;
-  queue_array[id].lock();
-  if (MLUOP_PREDICT_FALSE(queue_array[id].queue.empty())) {
-    queue_array[id].extend(queue_array[id].extend_num);
-    queue_array[id].extend_num *= 2;
+  queue_array.lock();
+  if MLUOP_PREDICT_FALSE (queue_array.queue.empty()) {
+    queue_array.extend(queue_array.extend_num);
+    queue_array.extend_num *= 2;
   }
-  *desc = queue_array[id].queue.front();
-  queue_array[id].queue.pop();
-  queue_array[id].unlock();
+  *desc = ::new (queue_array.queue.front()) mluOpTensorStruct;
+  queue_array.queue.pop_front();
+  queue_array.unlock();
 #else
-  mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct();
+  mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct;
   *desc = ts;
 #endif
 
@@ -341,32 +362,29 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreateGroupTensorDescriptors(
   PARAM_CHECK("[mluOpCreateGroupTensorDescriptors]", desc_num > 0);
 
 #if MLUOP_TENSOR_QUEUE_ENABLE
-  size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH;
-  queue_array[id].lock();
-  if (MLUOP_PREDICT_FALSE(queue_array[id].queue.empty() ||
-                          (size_t)desc_num >
-                              (size_t)queue_array[id].queue.size())) {
-    queue_array[id].extend(
-        std::max((size_t)queue_array[id].extend_num, (size_t)desc_num));
-    queue_array[id].extend_num =
-        2 * std::max((size_t)queue_array[id].extend_num, (size_t)desc_num);
+  queue_array.lock();
+  if MLUOP_PREDICT_FALSE (queue_array.queue.size() < desc_num) {
+    queue_array.extend(std::max(queue_array.extend_num, (size_t)desc_num));
+    queue_array.extend_num =
+        2 * std::max(queue_array.extend_num, (size_t)desc_num);
   }
   for (int i = 0; i < desc_num; ++i) {
-    *(group_desc[i]) = queue_array[id].queue.front();
-    queue_array[id].queue.pop();
+    *(group_desc[i]) = ::new (queue_array.queue.front()) mluOpTensorStruct;
+    queue_array.queue.pop_front();
   }
-  queue_array[id].unlock();
+  queue_array.unlock();
 #else
   for (int i = 0; i < desc_num; ++i) {
-    mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct();
-    *(group_desc[i]) = ts;
+    mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct;
+    group_desc[i][0] = ts;
   }
 #endif
 
   return MLUOP_STATUS_SUCCESS;
 }
 
-mluOpStatus_t mluOpSetTensorDescriptorZeroDim(mluOpTensorDescriptor_t desc) {
+static inline mluOpStatus_t mluOpSetTensorDescriptorZeroDim(
+    mluOpTensorDescriptor_t desc) {
   if (desc->pointer_mode == MLUOP_POINTER_MODE_HOST) {
     desc->dim = 0;
     desc->total_element_num = 1;
@@ -422,32 +440,23 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptor_v2(
   }
 }
 
-mluOpStatus_t mluOpSetTensorDescriptorDimBase(mluOpTensorDescriptor_t desc,
-                                              int dimNb, const void *dimSize) {
-  PARAM_CHECK("[mluOpSetTensorDescriptorDim]", desc != NULL);
-  PARAM_CHECK("[mluOpSetTensorDescriptorDim]", dimNb > 0);
-  PARAM_CHECK("[mluOpSetTensorDescriptorDim]", dimSize != NULL);
-
-  desc->dim = dimNb;
-
-  if (MLUOP_PREDICT_FALSE(desc->larger_dims != NULL)) {
-    delete[] desc->larger_dims;
-    desc->larger_dims = NULL;
-  }
-  if (MLUOP_PREDICT_FALSE(desc->larger_strides != NULL)) {
-    delete[] desc->larger_strides;
-    desc->larger_strides = NULL;
-  }
-  if (MLUOP_PREDICT_FALSE(dimNb > MLUOP_DIM_MAX)) {
-    desc->larger_dims = new (std::nothrow) int64_t[dimNb];
-    desc->larger_strides = new (std::nothrow) int64_t[dimNb];
-    desc->dims = desc->larger_dims;
-    desc->strides = desc->larger_strides;
-  } else {
-    desc->dims = desc->normal_dims;
-    desc->strides = desc->normal_strides;
+// Internal interface. Caller should guarantee parameter validity.
+static inline void mluOpSetTensorDescriptorDimBase(mluOpTensorDescriptor_t desc,
+                                                   int dimNb) {
+  if (dimNb != desc->dim) {
+    if MLUOP_PREDICT_FALSE (desc->dims != desc->normal_dims) {
+      delete[] desc->dims;
+      delete[] desc->strides;
+    }
+    if MLUOP_PREDICT_FALSE (dimNb > MLUOP_DIM_MAX) {
+      desc->dims = new (std::nothrow) int64_t[dimNb];
+      desc->strides = new (std::nothrow) int64_t[dimNb];
+    } else {
+      desc->dims = desc->normal_dims;
+      desc->strides = desc->normal_strides;
+    }
+    desc->dim = dimNb;
   }
-  return MLUOP_STATUS_SUCCESS;
 }
 
 mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim(
@@ -456,8 +465,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim(
     CHECK_RETURN("[mluOpSetTensorDescriptorDim]",
                  mluOpSetTensorDescriptorZeroDim(desc));
   } else {
-    CHECK_RETURN("[mluOpSetTensorDescriptorDim]",
-                 mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize));
+    mluOpSetTensorDescriptorDimBase(desc, dimNb);
     std::copy(dimSize, dimSize + dimNb, desc->dims);
   }
 
@@ -493,8 +501,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim(
 
 mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim_v2(
     mluOpTensorDescriptor_t desc, int dimNb, const int64_t *dimSize) {
-  CHECK_RETURN("[mluOpSetTensorDescriptorDim]",
-               mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize));
+  mluOpSetTensorDescriptorDimBase(desc, dimNb);
 
   memcpy(desc->dims, dimSize, dimNb * sizeof(int64_t));
 
@@ -541,34 +548,30 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetGroupTensorDescriptors(
 
   int group_dimSize_iterator = 0;
   for (int i = 0; i < desc_num; ++i) {
-    (*(group_desc[i]))->dim = group_dimNb[i];
-    (*(group_desc[i]))->dtype = group_dtype[i];
-    (*(group_desc[i]))->layout = group_layout[i];
+    group_desc[i][0]->dim = group_dimNb[i];
+    group_desc[i][0]->dtype = group_dtype[i];
+    group_desc[i][0]->layout = group_layout[i];
 
     if (MLUOP_PREDICT_FALSE(group_dimNb[i] > MLUOP_DIM_MAX)) {
-      (*(group_desc[i]))->larger_dims =
-          new (std::nothrow) int64_t[group_dimNb[i]];
-      (*(group_desc[i]))->larger_strides =
-          new (std::nothrow) int64_t[group_dimNb[i]];
-      (*(group_desc[i]))->dims = (*(group_desc[i]))->larger_dims;
-      (*(group_desc[i]))->strides = (*(group_desc[i]))->larger_strides;
+      group_desc[i][0]->dims = new (std::nothrow) int64_t[group_dimNb[i]];
+      group_desc[i][0]->strides = new (std::nothrow) int64_t[group_dimNb[i]];
     } else {
-      (*(group_desc[i]))->dims = (*(group_desc[i]))->normal_dims;
-      (*(group_desc[i]))->strides = (*(group_desc[i]))->normal_strides;
+      group_desc[i][0]->dims = group_desc[i][0]->normal_dims;
+      group_desc[i][0]->strides = group_desc[i][0]->normal_strides;
     }
     std::copy(group_dimSize + group_dimSize_iterator,
               group_dimSize + group_dimSize_iterator + group_dimNb[i],
-              (*(group_desc[i]))->dims);
+              group_desc[i][0]->dims);
 
     // infer strides of dimNb dimensions and compute total_num and total_size
     int strideBase = 1;
     for (int j = group_dimNb[i] - 1; j >= 0; --j) {
-      (*(group_desc[i]))->strides[j] = strideBase;
-      strideBase *= (*(group_desc[i]))->dims[j];
+      group_desc[i][0]->strides[j] = strideBase;
+      strideBase *= group_desc[i][0]->dims[j];
     }
-    (*(group_desc[i]))->total_element_num = strideBase;
-    (*(group_desc[i]))->total_tensor_size =
-        (*(group_desc[i]))->total_element_num *
+    group_desc[i][0]->total_element_num = strideBase;
+    group_desc[i][0]->total_tensor_size =
+        group_desc[i][0]->total_element_num *
         mluop::getSizeOfDataType(group_dtype[i]);
 
     // compute new iterator for next loop.
@@ -591,33 +594,29 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetGroupTensorDescriptors_v2(
 
   int group_dimSize_iterator = 0;
   for (int i = 0; i < desc_num; ++i) {
-    (*(group_desc[i]))->dim = group_dimNb[i];
-    (*(group_desc[i]))->dtype = group_dtype[i];
-    (*(group_desc[i]))->layout = group_layout[i];
+    group_desc[i][0]->dim = group_dimNb[i];
+    group_desc[i][0]->dtype = group_dtype[i];
+    group_desc[i][0]->layout = group_layout[i];
 
     if (MLUOP_PREDICT_FALSE(group_dimNb[i] > MLUOP_DIM_MAX)) {
-      (*(group_desc[i]))->larger_dims =
-          new (std::nothrow) int64_t[group_dimNb[i]];
-      (*(group_desc[i]))->larger_strides =
-          new (std::nothrow) int64_t[group_dimNb[i]];
-      (*(group_desc[i]))->dims = (*(group_desc[i]))->larger_dims;
-      (*(group_desc[i]))->strides = (*(group_desc[i]))->larger_strides;
+      group_desc[i][0]->dims = new (std::nothrow) int64_t[group_dimNb[i]];
+      group_desc[i][0]->strides = new (std::nothrow) int64_t[group_dimNb[i]];
     } else {
-      (*(group_desc[i]))->dims = (*(group_desc[i]))->normal_dims;
-      (*(group_desc[i]))->strides = (*(group_desc[i]))->normal_strides;
+      group_desc[i][0]->dims = group_desc[i][0]->normal_dims;
+      group_desc[i][0]->strides = group_desc[i][0]->normal_strides;
     }
-    memcpy((*(group_desc[i]))->dims, group_dimSize + group_dimSize_iterator,
+    memcpy(group_desc[i][0]->dims, group_dimSize + group_dimSize_iterator,
            group_dimNb[i] * sizeof(int64_t));
 
     // infer strides of dimNb dimensions and compute total_num and total_size
     int strideBase = 1;
     for (int j = group_dimNb[i] - 1; j >= 0; --j) {
-      (*(group_desc[i]))->strides[j] = strideBase;
-      strideBase *= (*(group_desc[i]))->dims[j];
+      group_desc[i][0]->strides[j] = strideBase;
+      strideBase *= group_desc[i][0]->dims[j];
     }
-    (*(group_desc[i]))->total_element_num = strideBase;
-    (*(group_desc[i]))->total_tensor_size =
-        (*(group_desc[i]))->total_element_num *
+    group_desc[i][0]->total_element_num = strideBase;
+    group_desc[i][0]->total_tensor_size =
+        group_desc[i][0]->total_element_num *
         mluop::getSizeOfDataType(group_dtype[i]);
 
     // compute new iterator for next loop.
@@ -630,7 +629,28 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetGroupTensorDescriptors_v2(
 mluOpStatus_t MLUOP_WIN_API
 mluOpResetTensorDescriptor(mluOpTensorDescriptor_t desc) {
   PARAM_CHECK("[mluOpResetTensorDescriptor]", desc != NULL);
-  desc->reset();
+
+  if MLUOP_PREDICT_FALSE (desc->dims != desc->normal_dims) {
+    delete[] desc->dims;
+    desc->dims = desc->normal_dims;
+  }
+  if MLUOP_PREDICT_FALSE (desc->strides != desc->normal_strides) {
+    delete[] desc->strides;
+    desc->strides = desc->normal_strides;
+  }
+
+  desc->dim = 0;
+  desc->dtype = MLUOP_DTYPE_FLOAT;
+  desc->onchip_dtype = MLUOP_DTYPE_INVALID;
+  desc->layout = MLUOP_LAYOUT_ARRAY;
+  desc->pointer_mode = MLUOP_POINTER_MODE_DEVICE;
+
+  desc->total_element_num = 0;
+  desc->total_tensor_size = 0;
+
+  desc->position = 0;
+  desc->scale = 1.0f;
+  desc->offset = 0;
 
   return MLUOP_STATUS_SUCCESS;
 }
@@ -653,7 +673,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorEx(
     PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimStride != NULL);
     PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimNb > 0);
 
-    mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize);
+    mluOpSetTensorDescriptorDimBase(desc, dimNb);
     std::copy(dimSize, dimSize + dimNb, desc->dims);
     std::copy(dimStride, dimStride + dimNb, desc->strides);
 
@@ -680,14 +700,13 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorEx_v2(
   desc->dtype = dtype;
   desc->layout = layout;
 
-  if (dimNb == 0) {
+  if MLUOP_PREDICT_FALSE (dimNb == 0) {
     return mluOpSetTensorDescriptorZeroDim(desc);
   } else {
     PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimSize != NULL);
     PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimStride != NULL);
-    PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimNb > 0);
 
-    mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize);
+    mluOpSetTensorDescriptorDimBase(desc, dimNb);
     memcpy(desc->dims, dimSize, dimNb * sizeof(int64_t));
     memcpy(desc->strides, dimStride, dimNb * sizeof(int64_t));
 
@@ -872,13 +891,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetTensorDescriptorPointerMode(
 mluOpStatus_t MLUOP_WIN_API
 mluOpDestroyTensorDescriptor(mluOpTensorDescriptor_t desc) {
   PARAM_CHECK("[mluOpDestroyTensorDescriptor]", desc != NULL);
-  desc->reset();
 
 #if MLUOP_TENSOR_QUEUE_ENABLE
-  size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH;
-  queue_array[id].lock();
-  queue_array[id].queue.emplace(desc);
-  queue_array[id].unlock();
+  queue_array.lock();
+  desc->~mluOpTensorStruct();
+  queue_array.queue.push_front(desc);
+  queue_array.unlock();
 #else
   delete desc;
 #endif
@@ -892,17 +910,15 @@ mluOpStatus_t MLUOP_WIN_API mluOpDestroyGroupTensorDescriptors(
   PARAM_CHECK("[mluOpDestroyGroupTensorDescriptors]", desc_num > 0);
 
 #if MLUOP_TENSOR_QUEUE_ENABLE
-  size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH;
-  queue_array[id].lock();
+  queue_array.lock();
   for (int i = 0; i < desc_num; ++i) {
-    (*(group_desc[i]))->reset();
-    queue_array[id].queue.emplace(*(group_desc[i]));
+    group_desc[i][0]->~mluOpTensorStruct();
+    queue_array.queue.push_front(group_desc[i][0]);
   }
-  queue_array[id].unlock();
+  queue_array.unlock();
 #else
   for (int i = 0; i < desc_num; ++i) {
-    (*(group_desc[i]))->reset();
-    delete (*(group_desc[i]));
+    delete group_desc[i][0];
   }
 #endif
 
@@ -913,9 +929,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpDestroyGroupTensorDescriptors(
 uint64_t MLUOP_WIN_API
 mluOpGetTensorElementNum(const mluOpTensorDescriptor_t desc) {
   CHECK(desc != NULL);
-  uint64_t tensor_num = 1;
-  auto return_status = desc->tensorElementsNumber(tensor_num);
-  return tensor_num;
+  return desc->total_element_num;
 }
 
 uint64_t mluOpGetSeqDataElementNum(mluOpSeqDataDescriptor_t desc) {
diff --git a/core/tensor.h b/core/tensor.h
index e237a15f6..38fba2a67 100644
--- a/core/tensor.h
+++ b/core/tensor.h
@@ -38,101 +38,103 @@
 
 #define QUEUE_ARRAY_LENGTH 4
 
-struct mluOpTensorStruct {
-  mluOpTensorStruct()
-      : dim(0),
-        dtype(MLUOP_DTYPE_FLOAT),
-        onchip_dtype(MLUOP_DTYPE_INVALID),
-        layout(MLUOP_LAYOUT_ARRAY),
-        pointer_mode(MLUOP_POINTER_MODE_DEVICE),
-        position(0),
-        scale(1.0),
-        offset(0) {
-    /* explicit set initial values for document use.
-     */
-  }
+struct alignas(64) mluOpTensorStruct {
+  /** default constructor */
+  mluOpTensorStruct() = default;
+
+  /** copy constructor */
+  mluOpTensorStruct(mluOpTensorStruct const &other) { *this = other; }
+
+  /** move constructor */
+  mluOpTensorStruct(mluOpTensorStruct const &&) = delete;
+
+  /** destructor */
   ~mluOpTensorStruct() {
-    /* please do NOT implement any codes here.
-     * a state-less struct should not hold any resources.
-     */
+    if MLUOP_PREDICT_FALSE (dims != normal_dims) {
+      delete[] dims;
+    }
+    if MLUOP_PREDICT_FALSE (strides != normal_strides) {
+      delete[] strides;
+    }
   }
+
+  /** copy assignment operator (deep copy of dims/strides) */
+  mluOpTensorStruct &operator=(mluOpTensorStruct const &other) {
+    if (this == &other) return *this;
+
+    // Always re-size storage for other.dim: reusing a buffer that was sized
+    // for the previous dim (or the inline arrays, when other needs more than
+    // MLUOP_DIM_MAX entries) would let the memcpy calls below overflow it.
+    // Inline storage is used whenever other.dim fits in MLUOP_DIM_MAX.
+    if (dims != normal_dims) delete[] dims;
+    if (strides != normal_strides) delete[] strides;
+    const bool on_heap = other.dim > MLUOP_DIM_MAX;
+    dims = on_heap ? new (std::nothrow) int64_t[other.dim] : normal_dims;
+    strides = on_heap ? new (std::nothrow) int64_t[other.dim] : normal_strides;
+
+    dim = other.dim;
+    dtype = other.dtype;
+    layout = other.layout;
+    onchip_dtype = other.onchip_dtype;
+    pointer_mode = other.pointer_mode;
+
+    total_element_num = other.total_element_num;
+    total_tensor_size = other.total_tensor_size;
+
+    memcpy(dims, other.dims, sizeof(int64_t) * dim);
+    memcpy(strides, other.strides, sizeof(int64_t) * dim);
+
+    position = other.position;
+    scale = other.scale;
+    offset = other.offset;
+
+    positions = other.positions;
+    scales = other.scales;
+    offsets = other.offsets;
+
+    return *this;
+  }
+
+  mluOpTensorStruct &operator=(mluOpTensorStruct const &&other) = delete;
+
   /* methods */
   mluOpStatus_t tensorDimN(size_t &dim);
   mluOpStatus_t tensorDimC(size_t &dim);
   mluOpStatus_t tensorDimH(size_t &dim);
   mluOpStatus_t tensorDimW(size_t &dim);
-  inline mluOpStatus_t tensorElementsNumber(size_t &elements) const {
-    elements = total_element_num;
-    return MLUOP_STATUS_SUCCESS;
-  }
-  inline mluOpStatus_t tensorSize(size_t &tensor_size) const {
-    tensor_size = total_tensor_size;
-    return MLUOP_STATUS_SUCCESS;
-  }
+
   inline bool isSameDims(const mluOpTensorStruct &other) const;
   inline bool isSameDims(const mluOpTensorStruct *other) const;
   inline bool isCpuScalar() const;
 
-  /* struct */
-  int dim = 0;
+  /* Try to pack and align the struct */
+  /*  ------------------- 64 Bytes - 1 -------------------*/
+  int64_t normal_dims[MLUOP_DIM_MAX];
+
+  /*  ------------------- 64 Bytes - 2 -------------------*/
+  int64_t normal_strides[MLUOP_DIM_MAX];
+
+  /*  ------------------- 64 Bytes - 3 -------------------*/
+  /* Offset - 0 */
   uint64_t total_element_num = 0;
   uint64_t total_tensor_size = 0;
-  // if dimNb > MLUOP_DIM_MAX (8), using larger_dims, malloc it and dims point
-  // it. else, using normal_dims, dont need malloc and free.
-  int64_t normal_dims[MLUOP_DIM_MAX] = {-1};
-  int64_t *larger_dims = NULL;
-  int64_t *dims = normal_dims;  // point the normal dims as default
-
-  int64_t normal_strides[MLUOP_DIM_MAX] = {-1};
-  int64_t *larger_strides = NULL;
+  int64_t *dims = normal_dims;        // point the normal dims as default
   int64_t *strides = normal_strides;  // point the normal strides as default
-
-  mluOpDataType_t dtype;
-  mluOpDataType_t onchip_dtype;
-  mluOpTensorLayout_t layout;
-  mluOpPointerMode_t pointer_mode;
-  int position;
-  float scale;
-  int offset;
+  /* Offset - 32 */
+  int dim = 0;
+  mluOpDataType_t dtype = MLUOP_DTYPE_FLOAT;
+  mluOpDataType_t onchip_dtype = MLUOP_DTYPE_INVALID;
+  mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY;
+  mluOpPointerMode_t pointer_mode = MLUOP_POINTER_MODE_DEVICE;
+
+  /* Offset - 52 */
+  /* To be removed */
+  int position = 0;
+  float scale = 1;
+  int offset = 0;
   std::vector<int> positions;
   std::vector<float> scales;
   std::vector<int> offsets;
-  inline void init() {  // reset random value after malloc.
-    // init these pointer.
-    // if not, when call reset() will free invalid pointer.
-    larger_dims = NULL;
-    larger_strides = NULL;
-
-    dim = 0;
-    total_element_num = 0;
-    total_tensor_size = 0;
-    dims = normal_dims;
-    strides = normal_strides;
-  }
-  inline void reset() {  // reset variable as default.
-    if (MLUOP_PREDICT_FALSE(larger_dims != NULL)) {
-      delete[] larger_dims;
-      larger_dims = NULL;
-    }
-    if (MLUOP_PREDICT_FALSE(larger_strides != NULL)) {
-      delete[] larger_strides;
-      larger_strides = NULL;
-    }
-    dims = normal_dims;
-    strides = normal_strides;
-    dtype = MLUOP_DTYPE_FLOAT;
-    onchip_dtype = MLUOP_DTYPE_INVALID;
-    layout = MLUOP_LAYOUT_ARRAY;
-    pointer_mode = MLUOP_POINTER_MODE_DEVICE;
-
-    position = 0;
-    scale = 1.0f;
-    offset = 0;
-
-    dim = 0;
-    total_element_num = 0;
-    total_tensor_size = 0;
-  }
 };
 
 // dim_set(rnn)     [layer_num, direction, cap_of_cell]
@@ -156,9 +158,7 @@ struct mluOpTensorSetStruct {
     CHECK(!this->tensor_set.empty());
     size_t tensor_set_size = 0;
     for (int i = 0; i < tensor_set.size(); i++) {
-      size_t size = 0;
-      tensor_set[i]->tensorSize(size);
-      tensor_set_size += size;
+      tensor_set_size += tensor_set[i]->total_tensor_size;
     }
     return tensor_set_size;
   }
@@ -175,9 +175,7 @@ struct mluOpTensorSetStruct {
     int64_t offset = 0;
     int index = this->getIndex(tensorIndex);
     for (int i = 0; i < index; i++) {
-      size_t ts_size = 0;
-      this->tensor_set[i]->tensorSize(ts_size);
-      offset += ts_size;
+      offset += tensor_set[i]->total_tensor_size;
     }
     data_offset[index] = offset;
     return offset;
@@ -220,9 +218,7 @@ struct mluOpTensorSetStruct {
     int offset = 0;
     data_offset[0] = offset;
     for (int i = 0; i < tensor_num - 1; i++) {
-      size_t ts_size = 0;
-      this->tensor_set[i]->tensorSize(ts_size);
-      offset += ts_size;
+      offset += tensor_set[i]->total_tensor_size;
       data_offset[i + 1] = offset;
     }
     return data_offset;
@@ -247,7 +243,7 @@ struct mluOpSeqDataStruct {
         position(0),
         scale(1.0),
         offset(0),
-        padding_fill(NULL) {
+        padding_fill(nullptr) {
     /* explicit set initial values for document use.
      */
   }
@@ -287,48 +283,6 @@ struct mluOpSeqDataStruct {
   void *padding_fill;
 };
 
-#ifndef MLUOP_TENSOR_QUEUE_ENABLE
-#define MLUOP_TENSOR_QUEUE_ENABLE 1
-#endif
-
-#if MLUOP_TENSOR_QUEUE_ENABLE
-struct mluOpTensorDescriptorQueueStruct {
-  mluOpTensorDescriptorQueueStruct() {
-    extend(extend_num);
-    extend_num *= 2;
-  }
-  explicit mluOpTensorDescriptorQueueStruct(size_t n) {
-    extend_num = n;
-    extend(extend_num);
-    extend_num *= 2;
-  }
-  ~mluOpTensorDescriptorQueueStruct() {
-    for (auto it : this->headers) {
-      delete[] it;
-    }
-  }
-  std::queue<mluOpTensorDescriptor_t> queue;
-  std::list<mluOpTensorStruct *> headers;
-  std::atomic_flag flag = ATOMIC_FLAG_INIT;
-  inline void lock() {
-    while (flag.test_and_set(std::memory_order_acquire)) {
-      std::this_thread::yield();
-    }
-  }
-  inline void unlock() { flag.clear(std::memory_order_release); }
-  inline void extend(size_t n) {
-    mluOpTensorStruct *header = new (std::nothrow) mluOpTensorStruct[n];
-    headers.emplace_back(header);
-    for (size_t i = 0; i < n; ++i) {
-      mluOpTensorStruct *desc = header + i;
-      desc->init();  // reset random value.
-      queue.emplace(desc);
-    }
-  }
-  size_t extend_num = 100;
-};
-#endif
-
 inline int mluOpDataTypeBytes(const mluOpDataType_t dt) {
   return mluop::getSizeOfDataType(dt);
 }
diff --git a/core/type.cpp b/core/type.cpp
index f135ca057..a6d2d8a88 100644
--- a/core/type.cpp
+++ b/core/type.cpp
@@ -84,36 +84,4 @@ std::string MLUOP_WIN_API MLUOP_ATTRIBUTE_FLATTEN getNameOfTensorLayout(mluOpTen
   return mluOpGetNameOfTensorLayout(layout);
 }
 
-size_t getSizeOfDataType(mluOpDataType_t dtype) {
-  switch (dtype) {
-    default: {
-      return 0;
-    }
-    case MLUOP_DTYPE_BOOL:
-    case MLUOP_DTYPE_INT8:
-    case MLUOP_DTYPE_UINT8: {
-      return 1;
-    }
-    case MLUOP_DTYPE_INT16:
-    case MLUOP_DTYPE_UINT16:
-    case MLUOP_DTYPE_HALF:
-    case MLUOP_DTYPE_BFLOAT16: {
-      return 2;
-    }
-    case MLUOP_DTYPE_INT31:
-    case MLUOP_DTYPE_INT32:
-    case MLUOP_DTYPE_UINT32:
-    case MLUOP_DTYPE_FLOAT:
-    case MLUOP_DTYPE_COMPLEX_HALF: {
-      return 4;
-    }
-    case MLUOP_DTYPE_UINT64:
-    case MLUOP_DTYPE_INT64:
-    case MLUOP_DTYPE_DOUBLE:
-    case MLUOP_DTYPE_COMPLEX_FLOAT: {
-      return 8;
-    }
-  }
-}
-
 }  // namespace mluop
diff --git a/core/type.h b/core/type.h
index bfabbd21d..46122e86b 100644
--- a/core/type.h
+++ b/core/type.h
@@ -52,9 +52,38 @@ static mluOpStatus_t getLowAndHighValueFrom64Bits(T value, uint32_t* high,
   return MLUOP_STATUS_SUCCESS;
 }
 
-// TODO(None) use const char * instead of std::string (need mluop_extra
-// fix first) hide visibility
-size_t MLUOP_WIN_API getSizeOfDataType(mluOpDataType_t dtype);
+static inline size_t MLUOP_WIN_API getSizeOfDataType(mluOpDataType_t dtype) {
+  switch (dtype) {
+    default: {
+      return 0;
+    }
+    case MLUOP_DTYPE_BOOL:
+    case MLUOP_DTYPE_INT8:
+    case MLUOP_DTYPE_UINT8: {
+      return 1;
+    }
+    case MLUOP_DTYPE_INT16:
+    case MLUOP_DTYPE_UINT16:
+    case MLUOP_DTYPE_HALF:
+    case MLUOP_DTYPE_BFLOAT16: {
+      return 2;
+    }
+    case MLUOP_DTYPE_INT31:
+    case MLUOP_DTYPE_INT32:
+    case MLUOP_DTYPE_UINT32:
+    case MLUOP_DTYPE_FLOAT:
+    case MLUOP_DTYPE_COMPLEX_HALF: {
+      return 4;
+    }
+    case MLUOP_DTYPE_UINT64:
+    case MLUOP_DTYPE_INT64:
+    case MLUOP_DTYPE_DOUBLE:
+    case MLUOP_DTYPE_COMPLEX_FLOAT: {
+      return 8;
+    }
+  }
+}
+
 std::string MLUOP_WIN_API getNameOfDataType(mluOpDataType_t dtype);  // NOLINT
 std::string MLUOP_WIN_API getNameOfTensorLayout(mluOpTensorLayout_t layout);  // NOLINT
 
diff --git a/kernels/ball_query/ball_query.cpp b/kernels/ball_query/ball_query.cpp
index a76ac7342..f3bbf4fa2 100644
--- a/kernels/ball_query/ball_query.cpp
+++ b/kernels/ball_query/ball_query.cpp
@@ -51,11 +51,7 @@ void policyFuncBallQuery(const mluOpHandle_t &handle,
   size_t core_in_cluster = handle->core_num_per_cluster;
   VLOG(5) << "In current device, core_in_cluster:" << core_in_cluster;
 
-  size_t total_data_num;
-  if (desc->tensorElementsNumber(total_data_num) != MLUOP_STATUS_SUCCESS) {
-    LOG(ERROR) << "[mluOpBallQuery], In policyFuncBallQuery function, fail to "
-                  "get elem_count";
-  }
+  size_t total_data_num = desc->total_element_num;
 
   // On a core, a lot of new_xyz data element can be stored; but only one data
   // element can be processed at a time. So a cluster can only process four data
diff --git a/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp b/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp
index 7dacdc153..ca2fecd21 100644
--- a/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp
+++ b/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp
@@ -39,8 +39,8 @@ static mluOpStatus_t getIndiceMaskAll(
     const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume,
     const int input_active_site, size_t *size) {
   size_t total_size = 0;
-  total_size =
-      kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype);
+  total_size = kernel_volume * input_active_site *
+               mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }
@@ -50,7 +50,8 @@ static mluOpStatus_t getIndiceIndexIn(
     const int input_active_site, size_t *size) {
   size_t total_size = 0;
   total_size =
-      kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype);
+      kernel_volume * input_active_site *
+      mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }
@@ -60,7 +61,8 @@ static mluOpStatus_t getIndiceIndexOut(
     const int input_active_site, size_t *size) {
   size_t total_size = 0;
   total_size =
-      kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype);
+      kernel_volume * input_active_site *
+      mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }
@@ -70,7 +72,8 @@ static mluOpStatus_t getIndiceOutExpand(
     const int input_active_site, size_t *size) {
   size_t total_size = 0;
   total_size =
-      kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype);
+      kernel_volume * input_active_site *
+      mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }
@@ -79,7 +82,8 @@ static mluOpStatus_t getIndiceInExpand(
     const mluOpTensorDescriptor_t indice_pairs_desc,
     const int input_active_site, size_t *size) {
   size_t total_size = 0;
-  total_size = input_active_site * sizeof(indice_pairs_desc->dtype);
+  total_size = input_active_site *
+  mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }
@@ -89,7 +93,7 @@ static mluOpStatus_t getIndiceUnique(
     const int input_active_site, size_t *size) {
   size_t total_size = 0;
   total_size = (kernel_volume * input_active_site + 1) *
-               sizeof(indice_pairs_desc->dtype);
+               mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }
@@ -97,7 +101,7 @@ static mluOpStatus_t getIndiceUnique(
 static mluOpStatus_t getGridOut(const mluOpTensorDescriptor_t indice_pairs_desc,
                                 int output_size, size_t *size) {
   size_t total_size = 0;
-  total_size = output_size * sizeof(indice_pairs_desc->dtype);
+  total_size = output_size * mluop::getSizeOfDataType(indice_pairs_desc->dtype);
   size[0] = total_size;
   return MLUOP_STATUS_SUCCESS;
 }