From 8478a9816ecd2553c37c45f1d55babf410feb9ca Mon Sep 17 00:00:00 2001 From: nth-BYTE <160582271+nth-BYTE@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:50:59 +0800 Subject: [PATCH] [Feature](mlu-ops): optimization for descriptor (#1083) Co-authored-by: tonghengwen --- core/logging.h | 2 +- core/tensor.cpp | 256 +++++++++--------- core/tensor.h | 214 ++++++--------- core/type.cpp | 32 --- core/type.h | 35 ++- kernels/ball_query/ball_query.cpp | 6 +- .../normal_get_indice_pairs.cpp | 20 +- 7 files changed, 265 insertions(+), 300 deletions(-) diff --git a/core/logging.h b/core/logging.h index f924fcedf..25bfd85e8 100644 --- a/core/logging.h +++ b/core/logging.h @@ -122,7 +122,7 @@ } #define PARAM_CHECK(api, condition, ...) \ - if (!(condition)) { \ + if MLUOP_PREDICT_FALSE (!(condition)) { \ LOG(ERROR) << api << " Check failed: " #condition ". " #__VA_ARGS__; \ return MLUOP_STATUS_BAD_PARAM; \ } diff --git a/core/tensor.cpp b/core/tensor.cpp index 5f0405146..cd9602fb2 100644 --- a/core/tensor.cpp +++ b/core/tensor.cpp @@ -22,7 +22,7 @@ *************************************************************************/ #include #include - +#include #include "core/tensor.h" #include "core/logging.h" #include "core/type.h" @@ -294,41 +294,62 @@ mluOpDestroySeqDataDescriptor(mluOpSeqDataDescriptor_t seq_data_desc) { return MLUOP_STATUS_SUCCESS; } -#if MLUOP_TENSOR_QUEUE_ENABLE -static mluOpTensorDescriptorQueueStruct *queue_array = NULL; -static std::hash hasher; +namespace { + +#define MLUOP_TENSOR_QUEUE_ENABLE 1 -MLUOP_ATTRIBUTE_CONSTRUCTOR MLUOP_ATTRIBUTE_VISIBILITY_HIDDEN void mluOpInit() { - if (!queue_array) { - queue_array = - new (std::nothrow) mluOpTensorDescriptorQueueStruct[QUEUE_ARRAY_LENGTH]; +#if MLUOP_TENSOR_QUEUE_ENABLE +struct mluOpTensorDescriptorQueueStruct { + mluOpTensorDescriptorQueueStruct() { + extend(extend_num); + extend_num *= 2; } -} + explicit mluOpTensorDescriptorQueueStruct(size_t n) { + extend_num = n; + extend(extend_num); + extend_num *= 2;
+ } + + // Let the OS do the cleanup since it's a global variable + ~mluOpTensorDescriptorQueueStruct() {} -MLUOP_ATTRIBUTE_DESTRUCTOR MLUOP_ATTRIBUTE_VISIBILITY_HIDDEN void mluOpExit() { - if (queue_array) { - delete[] queue_array; - queue_array = NULL; + inline void lock() { + while (flag.test_and_set(std::memory_order_acquire)) { + } } -} + inline void unlock() { flag.clear(std::memory_order_release); } + inline void extend(size_t n) { + mluOpTensorStruct *header = new (std::nothrow) mluOpTensorStruct[n]; + for (size_t i = 0; i < n; ++i) { + mluOpTensorStruct *desc = header + i; + queue.push_front(desc); + } + } + size_t extend_num = 128; + std::deque queue; + std::atomic_flag flag = ATOMIC_FLAG_INIT; +}; + +static mluOpTensorDescriptorQueueStruct queue_array; #endif +} // anonymous namespace + /* MLUOP interface */ mluOpStatus_t MLUOP_WIN_API mluOpCreateTensorDescriptor(mluOpTensorDescriptor_t *desc) { PARAM_CHECK("[mluOpCreateTensorDescriptor]", desc != NULL); #if MLUOP_TENSOR_QUEUE_ENABLE - size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH; - queue_array[id].lock(); - if (MLUOP_PREDICT_FALSE(queue_array[id].queue.empty())) { - queue_array[id].extend(queue_array[id].extend_num); - queue_array[id].extend_num *= 2; + queue_array.lock(); + if MLUOP_PREDICT_FALSE (queue_array.queue.empty()) { + queue_array.extend(queue_array.extend_num); + queue_array.extend_num *= 2; } - *desc = queue_array[id].queue.front(); - queue_array[id].queue.pop(); - queue_array[id].unlock(); + *desc = ::new (queue_array.queue.front()) mluOpTensorStruct; + queue_array.queue.pop_front(); + queue_array.unlock(); #else - mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct(); + mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct; *desc = ts; #endif @@ -341,32 +362,29 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreateGroupTensorDescriptors( PARAM_CHECK("[mluOpCreateGroupTensorDescriptors]", desc_num > 0); #if MLUOP_TENSOR_QUEUE_ENABLE - size_t id = 
hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH; - queue_array[id].lock(); - if (MLUOP_PREDICT_FALSE(queue_array[id].queue.empty() || - (size_t)desc_num > - (size_t)queue_array[id].queue.size())) { - queue_array[id].extend( - std::max((size_t)queue_array[id].extend_num, (size_t)desc_num)); - queue_array[id].extend_num = - 2 * std::max((size_t)queue_array[id].extend_num, (size_t)desc_num); + queue_array.lock(); + if MLUOP_PREDICT_FALSE (queue_array.queue.size() < desc_num) { + queue_array.extend(std::max(queue_array.extend_num, (size_t)desc_num)); + queue_array.extend_num = + 2 * std::max(queue_array.extend_num, (size_t)desc_num); } for (int i = 0; i < desc_num; ++i) { - *(group_desc[i]) = queue_array[id].queue.front(); - queue_array[id].queue.pop(); + *(group_desc[i]) = queue_array.queue.front(); + queue_array.queue.pop_front(); } - queue_array[id].unlock(); + queue_array.unlock(); #else for (int i = 0; i < desc_num; ++i) { - mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct(); - *(group_desc[i]) = ts; + mluOpTensorStruct *ts = new (std::nothrow) mluOpTensorStruct; + group_desc[i][0] = ts; } #endif return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t mluOpSetTensorDescriptorZeroDim(mluOpTensorDescriptor_t desc) { +static inline mluOpStatus_t mluOpSetTensorDescriptorZeroDim( + mluOpTensorDescriptor_t desc) { if (desc->pointer_mode == MLUOP_POINTER_MODE_HOST) { desc->dim = 0; desc->total_element_num = 1; @@ -422,32 +440,23 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptor_v2( } } -mluOpStatus_t mluOpSetTensorDescriptorDimBase(mluOpTensorDescriptor_t desc, - int dimNb, const void *dimSize) { - PARAM_CHECK("[mluOpSetTensorDescriptorDim]", desc != NULL); - PARAM_CHECK("[mluOpSetTensorDescriptorDim]", dimNb > 0); - PARAM_CHECK("[mluOpSetTensorDescriptorDim]", dimSize != NULL); - - desc->dim = dimNb; - - if (MLUOP_PREDICT_FALSE(desc->larger_dims != NULL)) { - delete[] desc->larger_dims; - desc->larger_dims = NULL; - } - if 
(MLUOP_PREDICT_FALSE(desc->larger_strides != NULL)) { - delete[] desc->larger_strides; - desc->larger_strides = NULL; - } - if (MLUOP_PREDICT_FALSE(dimNb > MLUOP_DIM_MAX)) { - desc->larger_dims = new (std::nothrow) int64_t[dimNb]; - desc->larger_strides = new (std::nothrow) int64_t[dimNb]; - desc->dims = desc->larger_dims; - desc->strides = desc->larger_strides; - } else { - desc->dims = desc->normal_dims; - desc->strides = desc->normal_strides; +// Internal interface. Caller should guarantee parameter validity. +static inline void mluOpSetTensorDescriptorDimBase(mluOpTensorDescriptor_t desc, + int dimNb) { + if (dimNb != desc->dim) { + if MLUOP_PREDICT_FALSE (desc->dims != desc->normal_dims) { + delete[] desc->dims; + delete[] desc->strides; + } + if MLUOP_PREDICT_FALSE (dimNb > MLUOP_DIM_MAX) { + desc->dims = new (std::nothrow) int64_t[dimNb]; + desc->strides = new (std::nothrow) int64_t[dimNb]; + } else { + desc->dims = desc->normal_dims; + desc->strides = desc->normal_strides; + } + desc->dim = dimNb; } - return MLUOP_STATUS_SUCCESS; } mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim( @@ -456,8 +465,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim( CHECK_RETURN("[mluOpSetTensorDescriptorDim]", mluOpSetTensorDescriptorZeroDim(desc)); } else { - CHECK_RETURN("[mluOpSetTensorDescriptorDim]", - mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize)); + mluOpSetTensorDescriptorDimBase(desc, dimNb); std::copy(dimSize, dimSize + dimNb, desc->dims); } @@ -493,8 +501,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim( mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorDim_v2( mluOpTensorDescriptor_t desc, int dimNb, const int64_t *dimSize) { - CHECK_RETURN("[mluOpSetTensorDescriptorDim]", - mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize)); + mluOpSetTensorDescriptorDimBase(desc, dimNb); memcpy(desc->dims, dimSize, dimNb * sizeof(int64_t)); @@ -541,34 +548,30 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetGroupTensorDescriptors( 
int group_dimSize_iterator = 0; for (int i = 0; i < desc_num; ++i) { - (*(group_desc[i]))->dim = group_dimNb[i]; - (*(group_desc[i]))->dtype = group_dtype[i]; - (*(group_desc[i]))->layout = group_layout[i]; + group_desc[i][0]->dim = group_dimNb[i]; + group_desc[i][0]->dtype = group_dtype[i]; + group_desc[i][0]->layout = group_layout[i]; if (MLUOP_PREDICT_FALSE(group_dimNb[i] > MLUOP_DIM_MAX)) { - (*(group_desc[i]))->larger_dims = - new (std::nothrow) int64_t[group_dimNb[i]]; - (*(group_desc[i]))->larger_strides = - new (std::nothrow) int64_t[group_dimNb[i]]; - (*(group_desc[i]))->dims = (*(group_desc[i]))->larger_dims; - (*(group_desc[i]))->strides = (*(group_desc[i]))->larger_strides; + group_desc[i][0]->dims = new (std::nothrow) int64_t[group_dimNb[i]]; + group_desc[i][0]->strides = new (std::nothrow) int64_t[group_dimNb[i]]; } else { - (*(group_desc[i]))->dims = (*(group_desc[i]))->normal_dims; - (*(group_desc[i]))->strides = (*(group_desc[i]))->normal_strides; + group_desc[i][0]->dims = group_desc[i][0]->normal_dims; + group_desc[i][0]->strides = group_desc[i][0]->normal_strides; } std::copy(group_dimSize + group_dimSize_iterator, group_dimSize + group_dimSize_iterator + group_dimNb[i], - (*(group_desc[i]))->dims); + group_desc[i][0]->dims); // infer strides of dimNb dimensions and compute total_num and total_size int strideBase = 1; for (int j = group_dimNb[i] - 1; j >= 0; --j) { - (*(group_desc[i]))->strides[j] = strideBase; - strideBase *= (*(group_desc[i]))->dims[j]; + group_desc[i][0]->strides[j] = strideBase; + strideBase *= group_desc[i][0]->dims[j]; } - (*(group_desc[i]))->total_element_num = strideBase; - (*(group_desc[i]))->total_tensor_size = - (*(group_desc[i]))->total_element_num * + group_desc[i][0]->total_element_num = strideBase; + group_desc[i][0]->total_tensor_size = + group_desc[i][0]->total_element_num * mluop::getSizeOfDataType(group_dtype[i]); // compute new iterator for next loop. 
@@ -591,33 +594,29 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetGroupTensorDescriptors_v2( int group_dimSize_iterator = 0; for (int i = 0; i < desc_num; ++i) { - (*(group_desc[i]))->dim = group_dimNb[i]; - (*(group_desc[i]))->dtype = group_dtype[i]; - (*(group_desc[i]))->layout = group_layout[i]; + group_desc[i][0]->dim = group_dimNb[i]; + group_desc[i][0]->dtype = group_dtype[i]; + group_desc[i][0]->layout = group_layout[i]; if (MLUOP_PREDICT_FALSE(group_dimNb[i] > MLUOP_DIM_MAX)) { - (*(group_desc[i]))->larger_dims = - new (std::nothrow) int64_t[group_dimNb[i]]; - (*(group_desc[i]))->larger_strides = - new (std::nothrow) int64_t[group_dimNb[i]]; - (*(group_desc[i]))->dims = (*(group_desc[i]))->larger_dims; - (*(group_desc[i]))->strides = (*(group_desc[i]))->larger_strides; + group_desc[i][0]->dims = new (std::nothrow) int64_t[group_dimNb[i]]; + group_desc[i][0]->strides = new (std::nothrow) int64_t[group_dimNb[i]]; } else { - (*(group_desc[i]))->dims = (*(group_desc[i]))->normal_dims; - (*(group_desc[i]))->strides = (*(group_desc[i]))->normal_strides; + group_desc[i][0]->dims = group_desc[i][0]->normal_dims; + group_desc[i][0]->strides = group_desc[i][0]->normal_strides; } - memcpy((*(group_desc[i]))->dims, group_dimSize + group_dimSize_iterator, + memcpy(group_desc[i][0]->dims, group_dimSize + group_dimSize_iterator, group_dimNb[i] * sizeof(int64_t)); // infer strides of dimNb dimensions and compute total_num and total_size int strideBase = 1; for (int j = group_dimNb[i] - 1; j >= 0; --j) { - (*(group_desc[i]))->strides[j] = strideBase; - strideBase *= (*(group_desc[i]))->dims[j]; + group_desc[i][0]->strides[j] = strideBase; + strideBase *= group_desc[i][0]->dims[j]; } - (*(group_desc[i]))->total_element_num = strideBase; - (*(group_desc[i]))->total_tensor_size = - (*(group_desc[i]))->total_element_num * + group_desc[i][0]->total_element_num = strideBase; + group_desc[i][0]->total_tensor_size = + group_desc[i][0]->total_element_num * 
mluop::getSizeOfDataType(group_dtype[i]); // compute new iterator for next loop. @@ -630,7 +629,28 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetGroupTensorDescriptors_v2( mluOpStatus_t MLUOP_WIN_API mluOpResetTensorDescriptor(mluOpTensorDescriptor_t desc) { PARAM_CHECK("[mluOpResetTensorDescriptor]", desc != NULL); - desc->reset(); + + if MLUOP_PREDICT_FALSE (desc->dims != desc->normal_dims) { + delete[] desc->dims; + desc->dims = desc->normal_dims; + } + if MLUOP_PREDICT_FALSE (desc->strides != desc->normal_strides) { + delete[] desc->strides; + desc->strides = desc->normal_strides; + } + + desc->dim = 0; + desc->dtype = MLUOP_DTYPE_FLOAT; + desc->onchip_dtype = MLUOP_DTYPE_INVALID; + desc->layout = MLUOP_LAYOUT_ARRAY; + desc->pointer_mode = MLUOP_POINTER_MODE_DEVICE; + + desc->total_element_num = 0; + desc->total_tensor_size = 0; + + desc->position = 0; + desc->scale = 1.0f; + desc->offset = 0; return MLUOP_STATUS_SUCCESS; } @@ -653,7 +673,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorEx( PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimStride != NULL); PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimNb > 0); - mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize); + mluOpSetTensorDescriptorDimBase(desc, dimNb); std::copy(dimSize, dimSize + dimNb, desc->dims); std::copy(dimStride, dimStride + dimNb, desc->strides); @@ -680,14 +700,13 @@ mluOpStatus_t MLUOP_WIN_API mluOpSetTensorDescriptorEx_v2( desc->dtype = dtype; desc->layout = layout; - if (dimNb == 0) { + if MLUOP_PREDICT_FALSE (dimNb == 0) { return mluOpSetTensorDescriptorZeroDim(desc); } else { PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimSize != NULL); PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimStride != NULL); - PARAM_CHECK("[mluOpSetTensorDescriptorEx]", dimNb > 0); - mluOpSetTensorDescriptorDimBase(desc, dimNb, (void *)dimSize); + mluOpSetTensorDescriptorDimBase(desc, dimNb); memcpy(desc->dims, dimSize, dimNb * sizeof(int64_t)); memcpy(desc->strides, dimStride, dimNb * sizeof(int64_t)); 
@@ -872,13 +891,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetTensorDescriptorPointerMode( mluOpStatus_t MLUOP_WIN_API mluOpDestroyTensorDescriptor(mluOpTensorDescriptor_t desc) { PARAM_CHECK("[mluOpDestroyTensorDescriptor]", desc != NULL); - desc->reset(); #if MLUOP_TENSOR_QUEUE_ENABLE - size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH; - queue_array[id].lock(); - queue_array[id].queue.emplace(desc); - queue_array[id].unlock(); + queue_array.lock(); + desc->~mluOpTensorStruct(); + queue_array.queue.push_front(desc); + queue_array.unlock(); #else delete desc; #endif @@ -892,17 +910,15 @@ mluOpStatus_t MLUOP_WIN_API mluOpDestroyGroupTensorDescriptors( PARAM_CHECK("[mluOpDestroyGroupTensorDescriptors]", desc_num > 0); #if MLUOP_TENSOR_QUEUE_ENABLE - size_t id = hasher(std::this_thread::get_id()) % QUEUE_ARRAY_LENGTH; - queue_array[id].lock(); + queue_array.lock(); for (int i = 0; i < desc_num; ++i) { - (*(group_desc[i]))->reset(); - queue_array[id].queue.emplace(*(group_desc[i])); + group_desc[i][0]->~mluOpTensorStruct(); + queue_array.queue.push_front(group_desc[i][0]); } - queue_array[id].unlock(); + queue_array.unlock(); #else for (int i = 0; i < desc_num; ++i) { - (*(group_desc[i]))->reset(); - delete (*(group_desc[i])); + delete group_desc[i][0]; } #endif @@ -913,9 +929,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpDestroyGroupTensorDescriptors( uint64_t MLUOP_WIN_API mluOpGetTensorElementNum(const mluOpTensorDescriptor_t desc) { CHECK(desc != NULL); - uint64_t tensor_num = 1; - auto return_status = desc->tensorElementsNumber(tensor_num); - return tensor_num; + return desc->total_element_num; } uint64_t mluOpGetSeqDataElementNum(mluOpSeqDataDescriptor_t desc) { diff --git a/core/tensor.h b/core/tensor.h index e237a15f6..38fba2a67 100644 --- a/core/tensor.h +++ b/core/tensor.h @@ -38,101 +38,103 @@ #define QUEUE_ARRAY_LENGTH 4 -struct mluOpTensorStruct { - mluOpTensorStruct() - : dim(0), - dtype(MLUOP_DTYPE_FLOAT), - onchip_dtype(MLUOP_DTYPE_INVALID), - 
layout(MLUOP_LAYOUT_ARRAY), - pointer_mode(MLUOP_POINTER_MODE_DEVICE), - position(0), - scale(1.0), - offset(0) { - /* explicit set initial values for document use. - */ - } +struct alignas(64) mluOpTensorStruct { + /** default constructor */ + mluOpTensorStruct() = default; + + /** copy constructor */ + mluOpTensorStruct(mluOpTensorStruct const &other) { *this = other; } + + /** move constructor */ + mluOpTensorStruct(mluOpTensorStruct const &&) = delete; + + /** destructor */ ~mluOpTensorStruct() { - /* please do NOT implement any codes here. - * a state-less struct should not hold any resources. - */ + if MLUOP_PREDICT_FALSE (dims != normal_dims) { + delete[] dims; + } + if MLUOP_PREDICT_FALSE (strides != normal_strides) { + delete[] strides; + } } + + /** copy assignment operator */ + mluOpTensorStruct &operator=(mluOpTensorStruct const &other) { + if (dim > MLUOP_DIM_MAX && (dim < other.dim || other.dim < MLUOP_DIM_MAX)) { + delete[] dims; + delete[] strides; + if (other.dim < MLUOP_DIM_MAX) { + dims = normal_dims; + strides = normal_strides; + } else { + dims = new (std::nothrow) int64_t[dim]; + strides = new (std::nothrow) int64_t[dim]; + } + } + + dim = other.dim; + dtype = other.dtype; + layout = other.layout; + onchip_dtype = other.onchip_dtype; + pointer_mode = other.pointer_mode; + + total_element_num = other.total_element_num; + total_tensor_size = other.total_tensor_size; + + memcpy(dims, other.dims, sizeof(int64_t) * dim); + memcpy(strides, other.strides, sizeof(int64_t) * dim); + + position = other.position; + scale = other.scale; + offset = other.offset; + + positions = other.positions; + scales = other.scales; + offsets = other.offsets; + + return *this; + } + + mluOpTensorStruct &operator=(mluOpTensorStruct const &&other) = delete; + /* methods */ mluOpStatus_t tensorDimN(size_t &dim); mluOpStatus_t tensorDimC(size_t &dim); mluOpStatus_t tensorDimH(size_t &dim); mluOpStatus_t tensorDimW(size_t &dim); - inline mluOpStatus_t 
tensorElementsNumber(size_t &elements) const { - elements = total_element_num; - return MLUOP_STATUS_SUCCESS; - } - inline mluOpStatus_t tensorSize(size_t &tensor_size) const { - tensor_size = total_tensor_size; - return MLUOP_STATUS_SUCCESS; - } + inline bool isSameDims(const mluOpTensorStruct &other) const; inline bool isSameDims(const mluOpTensorStruct *other) const; inline bool isCpuScalar() const; - /* struct */ - int dim = 0; + /* Try to pack and align the struct */ + /* ------------------- 64 Bytes - 1 -------------------*/ + int64_t normal_dims[MLUOP_DIM_MAX]; + + /* ------------------- 64 Bytes - 2 -------------------*/ + int64_t normal_strides[MLUOP_DIM_MAX]; + + /* ------------------- 64 Bytes - 3 -------------------*/ + /* Offset - 0 */ uint64_t total_element_num = 0; uint64_t total_tensor_size = 0; - // if dimNb > MLUOP_DIM_MAX (8), using larger_dims, malloc it and dims point - // it. else, using normal_dims, dont need malloc and free. - int64_t normal_dims[MLUOP_DIM_MAX] = {-1}; - int64_t *larger_dims = NULL; - int64_t *dims = normal_dims; // point the normal dims as default - - int64_t normal_strides[MLUOP_DIM_MAX] = {-1}; - int64_t *larger_strides = NULL; + int64_t *dims = normal_dims; // point the normal dims as default int64_t *strides = normal_strides; // point the normal strides as default - - mluOpDataType_t dtype; - mluOpDataType_t onchip_dtype; - mluOpTensorLayout_t layout; - mluOpPointerMode_t pointer_mode; - int position; - float scale; - int offset; + /* Offset - 32 */ + int dim = 0; + mluOpDataType_t dtype = MLUOP_DTYPE_FLOAT; + mluOpDataType_t onchip_dtype = MLUOP_DTYPE_INVALID; + mluOpTensorLayout_t layout = MLUOP_LAYOUT_ARRAY; + mluOpPointerMode_t pointer_mode = MLUOP_POINTER_MODE_DEVICE; + + /* Offset - 52 */ + /* To be removed*/ + int position = 0; + float scale = 1; + int offset = 0; std::vector positions; std::vector scales; std::vector offsets; - inline void init() { // reset random value after malloc. - // init these pointer. 
- // if not, when call reset() will free invalid pointer. - larger_dims = NULL; - larger_strides = NULL; - - dim = 0; - total_element_num = 0; - total_tensor_size = 0; - dims = normal_dims; - strides = normal_strides; - } - inline void reset() { // reset variable as default. - if (MLUOP_PREDICT_FALSE(larger_dims != NULL)) { - delete[] larger_dims; - larger_dims = NULL; - } - if (MLUOP_PREDICT_FALSE(larger_strides != NULL)) { - delete[] larger_strides; - larger_strides = NULL; - } - dims = normal_dims; - strides = normal_strides; - dtype = MLUOP_DTYPE_FLOAT; - onchip_dtype = MLUOP_DTYPE_INVALID; - layout = MLUOP_LAYOUT_ARRAY; - pointer_mode = MLUOP_POINTER_MODE_DEVICE; - - position = 0; - scale = 1.0f; - offset = 0; - - dim = 0; - total_element_num = 0; - total_tensor_size = 0; - } }; // dim_set(rnn) [layer_num, direction, cap_of_cell] @@ -156,9 +158,7 @@ struct mluOpTensorSetStruct { CHECK(!this->tensor_set.empty()); size_t tensor_set_size = 0; for (int i = 0; i < tensor_set.size(); i++) { - size_t size = 0; - tensor_set[i]->tensorSize(size); - tensor_set_size += size; + tensor_set_size += tensor_set[i]->total_tensor_size; } return tensor_set_size; } @@ -175,9 +175,7 @@ struct mluOpTensorSetStruct { int64_t offset = 0; int index = this->getIndex(tensorIndex); for (int i = 0; i < index; i++) { - size_t ts_size = 0; - this->tensor_set[i]->tensorSize(ts_size); - offset += ts_size; + offset += tensor_set[i]->total_tensor_size; } data_offset[index] = offset; return offset; @@ -220,9 +218,7 @@ struct mluOpTensorSetStruct { int offset = 0; data_offset[0] = offset; for (int i = 0; i < tensor_num - 1; i++) { - size_t ts_size = 0; - this->tensor_set[i]->tensorSize(ts_size); - offset += ts_size; + offset += tensor_set[i]->total_tensor_size; data_offset[i + 1] = offset; } return data_offset; @@ -247,7 +243,7 @@ struct mluOpSeqDataStruct { position(0), scale(1.0), offset(0), - padding_fill(NULL) { + padding_fill(nullptr) { /* explicit set initial values for document use. 
*/ } @@ -287,48 +283,6 @@ struct mluOpSeqDataStruct { void *padding_fill; }; -#ifndef MLUOP_TENSOR_QUEUE_ENABLE -#define MLUOP_TENSOR_QUEUE_ENABLE 1 -#endif - -#if MLUOP_TENSOR_QUEUE_ENABLE -struct mluOpTensorDescriptorQueueStruct { - mluOpTensorDescriptorQueueStruct() { - extend(extend_num); - extend_num *= 2; - } - explicit mluOpTensorDescriptorQueueStruct(size_t n) { - extend_num = n; - extend(extend_num); - extend_num *= 2; - } - ~mluOpTensorDescriptorQueueStruct() { - for (auto it : this->headers) { - delete[] it; - } - } - std::queue queue; - std::list headers; - std::atomic_flag flag = ATOMIC_FLAG_INIT; - inline void lock() { - while (flag.test_and_set(std::memory_order_acquire)) { - std::this_thread::yield(); - } - } - inline void unlock() { flag.clear(std::memory_order_release); } - inline void extend(size_t n) { - mluOpTensorStruct *header = new (std::nothrow) mluOpTensorStruct[n]; - headers.emplace_back(header); - for (size_t i = 0; i < n; ++i) { - mluOpTensorStruct *desc = header + i; - desc->init(); // reset random value. 
- queue.emplace(desc); - } - } - size_t extend_num = 100; -}; -#endif - inline int mluOpDataTypeBytes(const mluOpDataType_t dt) { return mluop::getSizeOfDataType(dt); } diff --git a/core/type.cpp b/core/type.cpp index f135ca057..a6d2d8a88 100644 --- a/core/type.cpp +++ b/core/type.cpp @@ -84,36 +84,4 @@ std::string MLUOP_WIN_API MLUOP_ATTRIBUTE_FLATTEN getNameOfTensorLayout(mluOpTen return mluOpGetNameOfTensorLayout(layout); } -size_t getSizeOfDataType(mluOpDataType_t dtype) { - switch (dtype) { - default: { - return 0; - } - case MLUOP_DTYPE_BOOL: - case MLUOP_DTYPE_INT8: - case MLUOP_DTYPE_UINT8: { - return 1; - } - case MLUOP_DTYPE_INT16: - case MLUOP_DTYPE_UINT16: - case MLUOP_DTYPE_HALF: - case MLUOP_DTYPE_BFLOAT16: { - return 2; - } - case MLUOP_DTYPE_INT31: - case MLUOP_DTYPE_INT32: - case MLUOP_DTYPE_UINT32: - case MLUOP_DTYPE_FLOAT: - case MLUOP_DTYPE_COMPLEX_HALF: { - return 4; - } - case MLUOP_DTYPE_UINT64: - case MLUOP_DTYPE_INT64: - case MLUOP_DTYPE_DOUBLE: - case MLUOP_DTYPE_COMPLEX_FLOAT: { - return 8; - } - } -} - } // namespace mluop diff --git a/core/type.h b/core/type.h index bfabbd21d..46122e86b 100644 --- a/core/type.h +++ b/core/type.h @@ -52,9 +52,38 @@ static mluOpStatus_t getLowAndHighValueFrom64Bits(T value, uint32_t* high, return MLUOP_STATUS_SUCCESS; } -// TODO(None) use const char * instead of std::string (need mluop_extra -// fix first) hide visibility -size_t MLUOP_WIN_API getSizeOfDataType(mluOpDataType_t dtype); +static inline size_t MLUOP_WIN_API getSizeOfDataType(mluOpDataType_t dtype) { + switch (dtype) { + default: { + return 0; + } + case MLUOP_DTYPE_BOOL: + case MLUOP_DTYPE_INT8: + case MLUOP_DTYPE_UINT8: { + return 1; + } + case MLUOP_DTYPE_INT16: + case MLUOP_DTYPE_UINT16: + case MLUOP_DTYPE_HALF: + case MLUOP_DTYPE_BFLOAT16: { + return 2; + } + case MLUOP_DTYPE_INT31: + case MLUOP_DTYPE_INT32: + case MLUOP_DTYPE_UINT32: + case MLUOP_DTYPE_FLOAT: + case MLUOP_DTYPE_COMPLEX_HALF: { + return 4; + } + case MLUOP_DTYPE_UINT64: + 
case MLUOP_DTYPE_INT64: + case MLUOP_DTYPE_DOUBLE: + case MLUOP_DTYPE_COMPLEX_FLOAT: { + return 8; + } + } +} + std::string MLUOP_WIN_API getNameOfDataType(mluOpDataType_t dtype); // NOLINT std::string MLUOP_WIN_API getNameOfTensorLayout(mluOpTensorLayout_t layout); // NOLINT diff --git a/kernels/ball_query/ball_query.cpp b/kernels/ball_query/ball_query.cpp index a76ac7342..f3bbf4fa2 100644 --- a/kernels/ball_query/ball_query.cpp +++ b/kernels/ball_query/ball_query.cpp @@ -51,11 +51,7 @@ void policyFuncBallQuery(const mluOpHandle_t &handle, size_t core_in_cluster = handle->core_num_per_cluster; VLOG(5) << "In current device, core_in_cluster:" << core_in_cluster; - size_t total_data_num; - if (desc->tensorElementsNumber(total_data_num) != MLUOP_STATUS_SUCCESS) { - LOG(ERROR) << "[mluOpBallQuery], In policyFuncBallQuery function, fail to " - "get elem_count"; - } + size_t total_data_num = desc->total_element_num; // On a core, a lot of new_xyz data element can be stored; but only one data // element can be processed at a time. 
So a cluster can only process four data diff --git a/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp b/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp index 7dacdc153..ca2fecd21 100644 --- a/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp +++ b/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp @@ -39,8 +39,8 @@ static mluOpStatus_t getIndiceMaskAll( const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, const int input_active_site, size_t *size) { size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); + total_size = kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -50,7 +50,8 @@ static mluOpStatus_t getIndiceIndexIn( const int input_active_site, size_t *size) { size_t total_size = 0; total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); + kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -60,7 +61,8 @@ static mluOpStatus_t getIndiceIndexOut( const int input_active_site, size_t *size) { size_t total_size = 0; total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); + kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -70,7 +72,8 @@ static mluOpStatus_t getIndiceOutExpand( const int input_active_site, size_t *size) { size_t total_size = 0; total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); + kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -79,7 +82,8 @@ static mluOpStatus_t getIndiceInExpand( const mluOpTensorDescriptor_t indice_pairs_desc, const int 
input_active_site, size_t *size) { size_t total_size = 0; - total_size = input_active_site * sizeof(indice_pairs_desc->dtype); + total_size = input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -89,7 +93,7 @@ static mluOpStatus_t getIndiceUnique( const int input_active_site, size_t *size) { size_t total_size = 0; total_size = (kernel_volume * input_active_site + 1) * - sizeof(indice_pairs_desc->dtype); + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -97,7 +101,7 @@ static mluOpStatus_t getIndiceUnique( static mluOpStatus_t getGridOut(const mluOpTensorDescriptor_t indice_pairs_desc, int output_size, size_t *size) { size_t total_size = 0; - total_size = output_size * sizeof(indice_pairs_desc->dtype); + total_size = output_size * mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; }