diff --git a/.github/ci_script/combine_log.py b/.github/ci_script/combine_log.py deleted file mode 100644 index 05a8a973c..000000000 --- a/.github/ci_script/combine_log.py +++ /dev/null @@ -1,46 +0,0 @@ -import time -import sys -import os -''' - Get info. - output_path: the target file that you want to combine sub log with. - list_path: the list of sub log name. When it is updated, the correspondding file will be add to output tail. - list_dir_path: the dir path where sub logs stored. - status_path: the path of status file. When status file is written to "success" or "fail", exit script. -''' - -output_path = sys.argv[1] -list_path = sys.argv[2] -list_dir_path = sys.argv[3] -status_path = sys.argv[4] - -if __name__ == '__main__': - # list_pos stores the last position that pointer of list file pointed to. - list_pos = 0 - while True: - list_file = open(list_path, 'r') - list_file.seek(list_pos) - # read all lines starting from list_pos. - items = list_file.readlines() - # update list_pos - list_pos = list_file.tell() - # if read any line - if items is not None: - items.sort() - for item in items: - sub_path = item.strip() - if sub_path != "": - file_name = list_dir_path + '/' + sub_path - # while True: - if os.path.exists(file_name): - os.system('cat ' + file_name + ' >> ' + output_path) - # break - # check status_file, when read "success" or "fail" exit cycle, or else, sleep some seconds and start from beginning. 
- status_file = open(status_path) - status = status_file.readline().strip() - status_file.close() - if "fail" in status or "success" in status or "Success" in status or "Fail" in status or "error" in status or "Error" in status: - break - else: - time.sleep(2) - diff --git a/.github/ci_script/file_guard.py b/.github/ci_script/file_guard.py deleted file mode 100644 index 527f86b9e..000000000 --- a/.github/ci_script/file_guard.py +++ /dev/null @@ -1,32 +0,0 @@ -import time -import sys -import os -guard_status_file = sys.argv[1] -guard_log_file = sys.argv[2] - -if __name__ == '__main__': - # where stores the last position that pointer pointed to. - where= 0 - while True: - file = open(guard_log_file, "r") - file.seek(where) - # if read any lines, call system echo to print each line. - for line in file.readlines(): - new_line = line.strip().replace("\'", "_").replace("\"", "_") - os.system('echo ' + "'%s'" % new_line) - # update where - where = file.tell() - file.close() - # check status, end process when read "success" or "fail" - status_file = open(guard_status_file, "r") - line = status_file.readline().strip() - status_file.close() - if "success" in line or "Success" in line: - print("Task success.") - break - elif "fail" in line or "Fail" in line or "error" in line or "Error" in line: - print("Task Fail.") - exit(-1) - # sleep for a while - time.sleep(2) - diff --git a/.github/ci_script/mlu-ops-all_system_test_script.sh b/.github/ci_script/mlu-ops-all_system_test_script.sh deleted file mode 100644 index 695ac36a6..000000000 --- a/.github/ci_script/mlu-ops-all_system_test_script.sh +++ /dev/null @@ -1,77 +0,0 @@ -# /bin/bash -# get PR id -pr_id="ALLSYSTEM" - -# generate time stamp -current=`date "+%Y-%m-%d %H:%M:%S"` -timeStamp=`date -d "$current" +%s` -currentTimeStamp=$((timeStamp*1000+10#`date "+%N"`/1000000)) - -# temporally set to mlu370 -card_type="MLU370-S4" - -# default repo name -repo_name="mlu-ops" - -# repo ci root path -repo_root="~/${repo_name}_ci/" -if 
[ ! -d $repo_root ];then - mkdir $repo_root -fi - -# repo ci requests path -requests_path="$repo_root/requests" -if [ ! -d $requests_path ];then - mkdir $requests_path -fi - -# gen name of this ci -request_name="${repo_name}_${pr_id}_${currentTimeStamp}_${card_type}" - -# gen file and dir for this request -request_root="$repo_root/$request_name/" -sub_logs_path="$request_root/sub_logs/" - -if [ ! -d $request_root ];then - mkdir $request_root -fi - -if [ ! -d $sub_logs_path ];then - mkdir $sub_logs_path -fi - -echo "working" > "$request_root/status" -chmod o+w "$request_root/status" - -if [ ! -f "$request_root/log" ];then - touch "$request_root/log" -fi - -chmod o+w "$request_root/log" - -if [ ! -f "$request_root/log_list" ];then - touch "$request_root/log_list" -fi - -chmod o+w "$request_root/log_list" - -# gen request file. -echo "${repo_name},${pr_id},${currentTimeStamp},${card_type}" > "$requests_path/${request_name}" - -# start script -python3 .github/ci_script/file_guard.py "$request_root/status" "$request_root/log" & -python3 .github/ci_script/combine_log.py "$request_root/log" "$request_root/log_list" "$request_root/sub_logs" "$request_root/status" & - -wait - -status=$( head -n +1 ${request_root}/status ) - -if [ "$status" != "success" ];then - return_info=$( sed -n 2p ${request_root}/status ) - echo "${return_info}" - exit -1 -else - return_info=$( sed -n 2p ${request_root}/status ) - echo "${return_info}" - exit 0 -fi diff --git a/.github/ci_script/mlu-ops-ci_script.sh b/.github/ci_script/mlu-ops-ci_script.sh deleted file mode 100644 index 1b9103d95..000000000 --- a/.github/ci_script/mlu-ops-ci_script.sh +++ /dev/null @@ -1,93 +0,0 @@ -# /bin/bash -# get PR id -PR_string=$(echo $GITHUB_REF | grep -Eo "/[0-9]*/") -pr_id=(${PR_string//// }) - -# generate time stamp -current=`date "+%Y-%m-%d %H:%M:%S"` -timeStamp=`date -d "$current" +%s` -currentTimeStamp=$((timeStamp*1000+10#`date "+%N"`/1000000)) - -# temporally set to mlu370 -card_type="MLU370-S4" - -# 
default repo name -repo_name="mlu-ops" - -# repo ci root path -repo_root="/home/user/${repo_name}_ci/" -if [ ! -d $repo_root ];then - mkdir $repo_root -fi -# repo ci requests path -requests_path="$repo_root/requests" -if [ ! -d $requests_path ];then - mkdir $requests_path -fi - -# gen name of this ci -request_name="${repo_name}_${pr_id}_${currentTimeStamp}_${card_type}.rqt" - -# gen file and dir for this request -request_root="$repo_root/$request_name/" -sub_logs_path="$request_root/sub_logs/" - - -# echo "${repo_root}" -# echo "${requests_path}" -# echo "${request_root}" - -if [ ! -d $request_root ];then - mkdir $request_root -fi - -if [ ! -d $sub_logs_path ];then - mkdir $sub_logs_path -fi - -echo "working" > "$request_root/status" -chmod o+w "$request_root/status" - -if [ ! -f "$request_root/log" ];then - touch "$request_root/log" -fi - -chmod o+w "$request_root/log" - -if [ ! -f "$request_root/log_list" ];then - touch "$request_root/log_list" -fi - -chmod o+w "$request_root/log_list" - -# gen request file. -# echo "${repo_name},${pr_id},${currentTimeStamp},${card_type}" > "$requests_path/${request_name}" - -echo "repo:${repo_name}" > "$requests_path/${request_name}" - -echo "pr_id:${pr_id}" >> "$requests_path/${request_name}" - -echo "timestamp:${currentTimeStamp}" >> "$requests_path/${request_name}" - -# change dir group for server and client, or when server/client try to delete request, ftp may raise error. 
- -#chgrp -R ftpuser $request_root -#chgrp -R ftpuser $requests_path - -# start script -python3 .github/ci_script/file_guard.py "$request_root/status" "$request_root/log" & -python3 .github/ci_script/combine_log.py "$request_root/log" "$request_root/log_list" "$request_root/sub_logs" "$request_root/status" & - -wait - -# status=$(cat ${request_root}/status) - -status=$( head -n +1 ${request_root}/status ) - -if [ "$status" != "success" ];then - cat ${request_root}/status - exit -1 -else - cat ${request_root}/status - exit 0 -fi diff --git a/.github/scripts/invoke_ci_test.sh b/.github/scripts/invoke_ci_test.sh new file mode 100644 index 000000000..bf3d55407 --- /dev/null +++ b/.github/scripts/invoke_ci_test.sh @@ -0,0 +1,20 @@ +# /bin/bash +# get PR id +github_ref=$(echo $GITHUB_REF | grep -Eo "/[0-9]*/") +pr_id=(${github_ref//// }) + +# generate time stamp +current_time=`date "+%Y-%m-%d %H:%M:%S"` +timestamp_string=`date -d "${current_time}" +%s` +current_timestamp=$((timestamp_string*1000+10#`date "+%N"`/1000000)) + +# temporally set to mlu370 +card_type="MLU370-S4" + +# default repo name +repo_name="mlu-ops" + +github_user=${GITHUB_ACTOR} + +# start script +python3 .github/scripts/run_ci_test.py ${repo_name} ${github_user} ${pr_id} ${current_timestamp} \ No newline at end of file diff --git a/.github/scripts/run_ci_test.py b/.github/scripts/run_ci_test.py new file mode 100644 index 000000000..7881d9930 --- /dev/null +++ b/.github/scripts/run_ci_test.py @@ -0,0 +1,60 @@ +import requests +import json +import time +import sys + +local_communication_port = 12547 + +params = sys.argv +try: + if len(params) == 5: + repo = params[1] + user = params[2] + pr_id = params[3] + timestamp = params[4] + elif len(params) == 4: + repo = params[1] + user = params[2] + pr_id = "" + timestamp = params[3] + else: + print("Got some wrong with input params. Test fail.") + exit(-1) +except Exception as e: + print("Got some wrong with input params. 
Test fail.") + exit(-1) + +json_obj = { + "timestamp": timestamp, + "repo": repo, + "pr_id": pr_id, + "trigger_type": "ci", + "trigger_id": user, + "repeat_times": "3", + "status": "running" +} +local_test_server = "http://localhost:" + str(local_communication_port) + +# invoke test +response = requests.post(local_test_server, json=json_obj) +# get internal id +task_obj = json.loads(response.text) + +try: + while 1: + response = requests.get(local_test_server + "/aiming=get_status&id=" + task_obj["id"]) + result = json.loads(response.text) + if "success" in result["status"] or "fail" in result["status"] or "error" in result["status"] or "stable" in result["status"]: + print(result["log"]) + print(result["status"]) + response = requests.get(local_test_server + "/aiming=end_job&id=" + task_obj["id"]) + if "success" in result["status"]: + exit(0) + else: + exit(-1) + break + time.sleep(10) +except Exception as e: + print(e) + print("Got internal error while invoking test. Since we can not reboot this test, you should rerun this test in github.") + exit(-1) diff --git a/.github/workflows/mluops_all_system_ci.discard b/.github/workflows/mluops_all_system_ci.discard deleted file mode 100644 index a63f73d96..000000000 --- a/.github/workflows/mluops_all_system_ci.discard +++ /dev/null @@ -1,59 +0,0 @@ -name: mluops_all_system_test - -on: - push: - branches: [master, r*] - paths: - - 'CMakeLists.txt' - - 'independent_build.sh' - - 'build.sh' - - 'build.property' - - 'test/mlu_op_gtest/CMakeLists.txt' - - 'cmake/*' - - 'CMakeLists.txt' - tags: - - v* - pull_request: - branches: [master, r*] - paths: - - '.github/workflows/mluops_all_system_ci.yaml' - - 'CMakeLists.txt' - - 'independent_build.sh' - - 'build.sh' - - 'build.property' - - 'test/mlu_op_gtest/CMakeLists.txt' - - 'cmake/*' - - 'CMakeLists.txt' - -jobs: - test: - strategy: - matrix: - runner: [mlu370-m8] - mlu_ops_version : [1.3.0] - cntoolkit_version : [3.14.0] - cnnl_version: [1.26.6] - os: [ubuntu20.04, centos7, 
centos8, kylin10, ubuntu22.04] - runs-on: ${{matrix.runner}} - steps: - - uses: actions/checkout@v3 - with: - submodules: 'true' - - - name: pull_images - run: | - docker pull docker-user.extrotec.com:30080/mlu-ops/mluops_ci:devel-x86_64-${{matrix.os}}-cntoolkit${{matrix.cntoolkit_version}}-cnnl${{matrix.cnnl_version}} - - - name: build_mlu_ops - run: > - docker run --rm -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:devel-x86_64-${{matrix.os}}-cntoolkit${{matrix.cntoolkit_version}}-cnnl${{matrix.cnnl_version}} - ./build.sh - - - name: mlu_ops_version_check - run: > - docker run --rm -v $(pwd):/work -w /work docker-user.extrotec.com:30080/mlu-ops/mluops_ci:devel-x86_64-${{matrix.os}}-cntoolkit${{matrix.cntoolkit_version}}-cnnl${{matrix.cnnl_version}} - bash version_check.sh ${{matrix.mlu_ops_version}} - - - name: clean - run: | - rm -rf build diff --git a/.github/workflows/mluops_ci.yaml b/.github/workflows/mluops_ci.yaml index 3b010944c..09cd46549 100644 --- a/.github/workflows/mluops_ci.yaml +++ b/.github/workflows/mluops_ci.yaml @@ -48,4 +48,4 @@ jobs: - name: run_mlu_ops_ci run: > - bash .github/ci_script/mlu-ops-ci_script.sh + bash .github/scripts/invoke_ci_test.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index c7d1909b2..96611860e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,7 @@ if(${MLUOP_BUILD_ASAN_CHECK} MATCHES "ON") endif() # -- BANG memcheck +#TODO remove this option after cntoolkit upgraded to 4.0 if(${MLUOP_BUILD_BANG_MEMCHECK} MATCHES "ON") message("-- BANG memcheck enabled") set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -mllvm -enable-mlisa-sanitizer") @@ -89,6 +90,36 @@ if(NOT BANG_FOUND) elseif (NOT BANG_CNCC_EXECUTABLE) message(FATAL_ERROR "cncc not found, please ensure cncc is in your PATH env or set variable BANG_CNCC_EXECUTABLE from cmake. 
Otherwise you should check path used by find_program(BANG_CNCC_EXECUTABLE) in FindBANG.cmake") endif() +message(STATUS "BANG_CNCC_EXECUTABLE=${BANG_CNCC_EXECUTABLE}") +execute_process( + COMMAND ${BANG_CNCC_EXECUTABLE} --version + COMMAND head -n1 + COMMAND awk "{print $2}" + COMMAND sed "s/^v//g" + OUTPUT_VARIABLE _cncc_version + OUTPUT_STRIP_TRAILING_WHITESPACE +) +message(STATUS "cncc version ${_cncc_version}") +execute_process( + COMMAND echo ${_cncc_version} + COMMAND cut -d "." -f1 + OUTPUT_VARIABLE _cncc_version_major + OUTPUT_STRIP_TRAILING_WHITESPACE +) +execute_process( + COMMAND echo ${_cncc_version} + COMMAND cut -d "." -f2 + OUTPUT_VARIABLE _cncc_version_minor + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if (NOT "${_cncc_version}" VERSION_LESS "4.15.0") + set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -mllvm --fmlu-memintr-warning=true") +endif() + +if(NOT "${_cncc_version}" VERSION_LESS "4.1.0") + set(BANG_CNCC_FLAGS "${BANG_CNCC_FLAGS} -mllvm --fmlu-addrspace-warning") +endif() find_package(fmt REQUIRED) # setup cncc flags diff --git a/core/context.h b/core/context.h index 799a00ce2..ab9fa9cae 100644 --- a/core/context.h +++ b/core/context.h @@ -96,17 +96,17 @@ struct mluOpContext { switch (function_type) { default: return 0; - case CNRT_FUNC_TYPE_BLOCK: + case cnrtFuncTypeBlock: return job_num[0]; - case CNRT_FUNC_TYPE_UNION1: + case cnrtFuncTypeUnion1: return job_num[1]; - case CNRT_FUNC_TYPE_UNION2: + case cnrtFuncTypeUnion2: return job_num[2]; - case CNRT_FUNC_TYPE_UNION4: + case cnrtFuncTypeUnion4: return job_num[3]; - case CNRT_FUNC_TYPE_UNION8: + case cnrtFuncTypeUnion8: return job_num[4]; - case CNRT_FUNC_TYPE_UNION16: + case cnrtFuncTypeUnion16: return job_num[5]; } } diff --git a/core/gen_case.h b/core/gen_case.h index ccb867ab0..e815c29ff 100644 --- a/core/gen_case.h +++ b/core/gen_case.h @@ -472,9 +472,9 @@ class PbNode { void *data = malloc(data_size); auto memcpy_dir = (tensors[index].desc->pointer_mode == MLUOP_POINTER_MODE_HOST - ? 
CNRT_MEM_TRANS_DIR_HOST2HOST - : CNRT_MEM_TRANS_DIR_DEV2HOST); - if (CNRT_RET_SUCCESS == + ? cnrtMemcpyHostToHost + : cnrtMemcpyDevToHost); + if (cnrtSuccess == cnrtMemcpy(data, const_cast(tensors[index].device_ptr), data_size, memcpy_dir)) { return data; @@ -538,9 +538,9 @@ inline void PbNode::appendOpParam(std::string param_name, int data_width = mluop::getSizeOfDataType(dtype); if (attr.type == cnrtMemTypeDevice) { void *data = malloc(data_width); - if (CNRT_RET_SUCCESS == cnrtMemcpy(data, const_cast(param_value), + if (cnrtSuccess == cnrtMemcpy(data, const_cast(param_value), data_width, - CNRT_MEM_TRANS_DIR_DEV2HOST)) { + cnrtMemcpyDevToHost)) { op_param.params.push_back({param_name, get_data_string(dtype, data, 0)}); } else { LOG(ERROR) << "[gen_case] dump op param failed, param_name is " diff --git a/core/logging.h b/core/logging.h index 25bfd85e8..3f77fc407 100644 --- a/core/logging.h +++ b/core/logging.h @@ -95,7 +95,7 @@ cnrtGetLastError(); \ kernel; \ cnrtRet_t ret = cnrtPeekAtLastError(); \ - if (MLUOP_PREDICT_FALSE(CNRT_RET_SUCCESS != ret)) { \ + if (MLUOP_PREDICT_FALSE(cnrtSuccess != ret)) { \ LOG(ERROR) << "Check failed: Found " << cnrtGetErrorStr(ret) \ << " after invoke kernel " #kernel; \ return MLUOP_STATUS_EXECUTION_FAILED; \ diff --git a/core/runtime/device.h b/core/runtime/device.h index 38ab31015..fe8aed516 100644 --- a/core/runtime/device.h +++ b/core/runtime/device.h @@ -121,17 +121,17 @@ inline cnrtFunctionType_t castCnKernelClassToCnrtFuncType(KernelClass jobType) { default: return CNRT_FUNC_TYPE_MUTABLE; case CN_KERNEL_CLASS_BLOCK: - return CNRT_FUNC_TYPE_BLOCK; + return cnrtFuncTypeBlock; case CN_KERNEL_CLASS_UNION: - return CNRT_FUNC_TYPE_UNION1; + return cnrtFuncTypeUnion1; case CN_KERNEL_CLASS_UNION2: - return CNRT_FUNC_TYPE_UNION2; + return cnrtFuncTypeUnion2; case CN_KERNEL_CLASS_UNION4: - return CNRT_FUNC_TYPE_UNION4; + return cnrtFuncTypeUnion4; case CN_KERNEL_CLASS_UNION8: - return CNRT_FUNC_TYPE_UNION8; + return 
cnrtFuncTypeUnion8; case CN_KERNEL_CLASS_UNION16: - return CNRT_FUNC_TYPE_UNION16; + return cnrtFuncTypeUnion16; } } diff --git a/core/tensor.cpp b/core/tensor.cpp index cd9602fb2..ad7234c68 100644 --- a/core/tensor.cpp +++ b/core/tensor.cpp @@ -310,8 +310,12 @@ struct mluOpTensorDescriptorQueueStruct { extend_num *= 2; } - // Let the OS do the cleanup since it's a global variable - ~mluOpTensorDescriptorQueueStruct() {} + // cleanup headers + ~mluOpTensorDescriptorQueueStruct() { + for (auto header : headers) { + delete[] header; + } + } inline void lock() { while (flag.test_and_set(std::memory_order_acquire)) { @@ -321,12 +325,13 @@ struct mluOpTensorDescriptorQueueStruct { inline void extend(size_t n) { mluOpTensorStruct *header = new (std::nothrow) mluOpTensorStruct[n]; for (size_t i = 0; i < n; ++i) { - mluOpTensorStruct *desc = header + i; - queue.push_front(desc); + queue.push_front(header + i); } + headers.push_back(header); } size_t extend_num = 128; std::deque queue; + std::vector headers; std::atomic_flag flag = ATOMIC_FLAG_INIT; }; diff --git a/docs/BANG C OPS-Develop-Guide.md b/docs/BANG C OPS-Develop-Guide.md index ad1b8893d..7eb995688 100644 --- a/docs/BANG C OPS-Develop-Guide.md +++ b/docs/BANG C OPS-Develop-Guide.md @@ -458,7 +458,7 @@ double get_io_bandwidth() { } void get_policy_function_block(cnrtDim3_t *dim, cnrtFunctionType_t *func_type) { - *func_type = CNRT_FUNC_TYPE_BLOCK; + *func_type = cnrtFuncTypeBlock; dim->x = 1; dim->y = 1; dim->z = 1; @@ -466,7 +466,7 @@ void get_policy_function_block(cnrtDim3_t *dim, cnrtFunctionType_t *func_type) { } void get_policy_function_union1(cnrtDim3_t *dim, cnrtFunctionType_t *func_type) { - *func_type = CNRT_FUNC_TYPE_UNION1; + *func_type = cnrtFuncTypeUnion1; dim->x = get_core_num_per_cluster(); dim->y = get_cluster_num(); dim->z = 1; @@ -702,7 +702,7 @@ int main() { cnDeviceGetAttribute(&dimX, CN_DEVICE_ATTRIBUTE_MAX_CORE_COUNT_PER_CLUSTER, dev); cnrtDim3_t dim3 = {dimX, 1, 1}; - cnrtFunctionType_t ktype 
= CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t ktype = cnrtFuncTypeUnion1; foo<<>>(); CHRT_CHECK(cnrtSyncQueue(queue)); @@ -924,7 +924,7 @@ UnionN (N=1, 2, 4, 8, ...) 任务表示一个 Kernel 在执行时至少需要占 ```c++ void void get_policy_function_union1(cnrtDim3_t *dim, cnrtFunctionType_t *func_type) { - *func_type = CNRT_FUNC_TYPE_UNION1; + *func_type = cnrtFuncTypeUnion1; dim->x = get_core_num_per_cluster(); dim->y = get_cluster_num(); dim->z = 1; @@ -2371,7 +2371,7 @@ mlu_op.h // mluop 接口声明文件 cnrtFunctionType_t *k_type) { size_t dim = mluOpGetTensorElementNum(desc); // Union1 policyFunc - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = handle->core_num_per_cluster; k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; diff --git a/docs/MLU-OPS-OpList.md b/docs/MLU-OPS-OpList.md index 1b04d2815..1fb78cdfa 100755 --- a/docs/MLU-OPS-OpList.md +++ b/docs/MLU-OPS-OpList.md @@ -57,7 +57,9 @@ MLU Binary Op算子结构:  | indice_convolution_backward_data | √ | | | indice_convolution_backward_filter | √ | | | indice_convolution_forward | √ | | +| lgamma | √ | | | log | √ | | +| logspace | √ | | | masked_col2im_forward | √ | | | masked_im2col_forward | √ | | | moe_dispatch_backward_data | √ | | diff --git a/docs/design_docs/ball_query/ball_query.md b/docs/design_docs/ball_query/ball_query.md index f72f2b3a9..66cb409c8 100644 --- a/docs/design_docs/ball_query/ball_query.md +++ b/docs/design_docs/ball_query/ball_query.md @@ -356,7 +356,6 @@ __bang_add(vec_sub_x1, vec_sub_x1, vec_sub_z1, PAD_UP(num_deal_xyz, 64)); ``` step 6:对于step5得到dist2, dist2 和min_radius2和max_radius2、0进行对比,求出dist2== 0||  (dist2 >= min_radius2 && dist2 < max_radius2)对应位置的值为1(表示当前xyz点在以new_xyz点为球心,以min_radius和max_radius为半径的球域内)。 ```C++ -#if __BANG_ARCH__ >= 372 // distance2 >= min_radius2 __bang_ge_scalar(tmp_addr, distance2, min_radius2, num_deal_xyz); // distance2 < max_radius2 @@ -367,20 +366,6 @@ step 6:对于step5得到dist2, dist2 和min_radius2和max_radius2、0进行 
__bang_eq_scalar(output_addr, distance2, 0, num_deal_xyz); // distance2 == 0 | min_radius2 <= distance2 < max_radius2 __bang_or(output_addr, output_addr, tmp_addr, num_deal_xyz); -#else - // distance2 >= min_radius2 - __bang_ge_scalar(tmp_addr, distance2, min_radius2, num_deal_xyz); - // distance2 < max_radius2 - __bang_ge_scalar(output_addr, distance2, max_radius2, num_deal_xyz); - __bang_not(output_addr, output_addr, num_deal_xyz); - // min_radius2 <= distance2 < max_radius2 - __bang_and(tmp_addr, tmp_addr, output_addr, num_deal_xyz); - // distance2 == 0 - // __bang_write_zero(tmp2, num_deal_xyz);// 提前 - __bang_eq(output_addr, distance2, zeros_addr, num_deal_xyz); - // distance2 == 0 | min_radius2 <= distance2 < max_radius2 - __bang_or(output_addr, output_addr, tmp_addr, num_deal_xyz); -#endif ``` step7:通过__bang_select把在球域内点的index选出 ```C++ @@ -449,7 +434,7 @@ vec_idx_num[k] += selected_num; k_dim->y = needed_cluster_num > cluster_num ? cluster_num : needed_cluster_num; k_dim->z = 1; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; ``` ![image](ballquery_split.jpg) diff --git a/docs/design_docs/carafe_backward/carafe_backward.md b/docs/design_docs/carafe_backward/carafe_backward.md index 242f5fea5..0f4685a88 100755 --- a/docs/design_docs/carafe_backward/carafe_backward.md +++ b/docs/design_docs/carafe_backward/carafe_backward.md @@ -117,13 +117,13 @@ CARAFE总模块一共有2部分组成,分别是Kernel Prediction Module和Cont | 原位限制 | 不支持原位 | | stride限制 | 不支持stride机制 | | 广播限制 | 不支持广播 | -| 其他限制 | 算子用到atomic_add指令,half类型在200系列精度较低,因此在200系列仅支持小规模case。| +| 其他限制 | 无 | ### 1.5 验收标准 #### 1.5.1 精度验收标准 -- MLUOP精度验收标准:该算子为累加类算子,MLU300系列采用当前的 diff1 diff2 动态阈值评价公式,MLU200系列采用静态阈值 (3e-3)。 +- MLUOP精度验收标准:该算子为累加类算子,MLU300系列采用当前的 diff1 diff2 动态阈值评价公式。 #### 1.5.2 性能验收标准 diff --git a/docs/design_docs/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.md b/docs/design_docs/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.md index 
539abf350..3b4aed88d 100644 --- a/docs/design_docs/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.md +++ b/docs/design_docs/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.md @@ -319,7 +319,7 @@ KERNEL_CHECK((KernelMaskFillCoorsForward( CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); int32_t num_voxels = 0; - cnrtMemcpy(&num_voxels, voxel_num, sizeof(int), CNRT_MEM_TRANS_DIR_DEV2HOST); + cnrtMemcpy(&num_voxels, voxel_num, sizeof(int), cnrtMemcpyDevToHost); ``` - kernel3: KernelRemoveFirstForward diff --git a/docs/design_docs/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.md b/docs/design_docs/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.md index ec3667391..f38d36020 100644 --- a/docs/design_docs/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.md +++ b/docs/design_docs/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.md @@ -151,7 +151,7 @@ gradInput = FL^{'} *weight* gradOutput = | ------------ | ------------------------------------------------------------ | | 数据类型限制 | 数据类型需与1.3小节匹配 | | 布局限制 | 物理布局需与1.3小节匹配 | -| 规模限制 | 1.gamma暂不支持小于等于0的规模
2. 当weight为NULL时,target中元素的取值为[0,C];当weight不为NULL时,target中元素的取值为[0,C-1]
3. 此版本优先支持多核之间C维度不拆的case。当weight为NULL时,在MLU200系列板卡中C的范围需在[0, 8154], MLU370板卡中C的范围需在[0,13615];当weight不为NULL时,在MLU200系列板卡中C的范围需在[0, 7520], MLU370板卡中C的范围需在[0,12544];
4. 由于硬件激活指令精度不足,在MLU200系列板块中,gamma的取值范围需在[0, 8],在MLU370中,gamma的取值范围需在[0, 10000]
5. weight 暂不支持包含 inf 和 -inf 的输入 | +| 规模限制 | 1.gamma暂不支持小于等于0的规模
2. 当weight为NULL时,target中元素的取值为[0,C];当weight不为NULL时,target中元素的取值为[0,C-1]
3. 此版本优先支持多核之间C维度不拆的case。当weight为NULL时,MLU370板卡中C的范围需在[0,13615];当weight不为NULL时,MLU370板卡中C的范围需在[0,12544];
4. 由于硬件激活指令精度不足,在MLU370中,gamma的取值范围需在[0, 10000]
5. weight 暂不支持包含 inf 和 -inf 的输入 | | 功能限制 | 1. reduction为预留参数,暂不支持None以外的情况
2.reduction为预留参数,暂不支持HIGH_PRECISION模式
3.此版本暂不支持输入数据为half类型 | ### 1.5 验收标准 diff --git a/docs/design_docs/generate_proposals_v2/generate_proposals_v2.md b/docs/design_docs/generate_proposals_v2/generate_proposals_v2.md index 35f58368c..fa534e23d 100644 --- a/docs/design_docs/generate_proposals_v2/generate_proposals_v2.md +++ b/docs/design_docs/generate_proposals_v2/generate_proposals_v2.md @@ -606,13 +606,9 @@ __mul_func__ void getTopKVal(T * scores, T * bbox_deltas, T *anchors, T *varianc // output = exp(input) __mlu__func void calcExp(T * output, const T * input, cpnst int length){ -#if __BANG_ARCH__ >= 372 #define LOG_2_E (1.44269504088f) __bang_mul_scalar(output, input, (float)LOG_2_E, length); __bang_pow2(output, output, length); -#else - __bang_active(output, input, length); -#endif } // 生成proposals __mlu__func void proposalsBoxesDecode(const T* anchor, const T *deltas, const T *var, const int deal_size, T * proposals, T *nram_temp, bool pixes_offset = true){ diff --git a/docs/design_docs/lgamma/lgamma.md b/docs/design_docs/lgamma/lgamma.md index 22792d341..a49a04b0e 100644 --- a/docs/design_docs/lgamma/lgamma.md +++ b/docs/design_docs/lgamma/lgamma.md @@ -144,13 +144,17 @@ lgamma 算子是 element wise 类型的算子,因此只需要按照输入数 - step3 使用 [Spouge's approximation](https://en.wikipedia.org/wiki/Spouge%27s_approximation) 算法进行计算,考虑到精度,half 需升格为 float 完成计算再转换回 half - $$ \Gamma \left(z + 1\right) = (z + a)^{z + 0.5}e^{ - z - a}\left(c_{0} + \sum \limits_{k = 1}^{a - 1}\frac{c_{k}}{z + k} + \varepsilon _{a}\left(z\right)\right) $$ +$$ +\Gamma \left(z + 1\right) = (z + a)^{z + 0.5}e^{ - z - a}\left(c_{0} + \sum \limits_{k = 1}^{a - 1}\frac{c_{k}}{z + k} + \varepsilon _{a}\left(z\right)\right) +$$ 其中整数 a 的取值决定了 $c_{i}$ 的值,具体计算参考链接网页;且要求 $z > 0$ 计算 Lgamma 函数则是在 Gamma 函数数值逼近算法上取 Log 后,先进行公式上的化简再计算值。同时考虑到以上 Gamma 函数的逼近算法只对输入大于 0 的情况有效,需要通过 [Euler's reflection formula](https://en.wikipedia.org/wiki/Reflection_formula) 计算 $z <= 0$ 的情况: - $$ \Gamma(1-z) \Gamma(z) = \frac{\pi}{sin(\pi z)}$ => $ \Gamma(z) = \frac{\pi}{sin(\pi z) 
\Gamma(1-z) }$$ +$$ +\Gamma(1-z) \Gamma(z) = \frac{\pi}{sin(\pi z)} => \Gamma(z) = \frac{\pi}{sin(\pi z) \Gamma(1-z) } +$$ ### 3.2 伪代码实现 ``` @@ -175,7 +179,7 @@ for (size_t k = 1; k < numCoeff; k++) { } float lgamma_x = (reflect_x+0.5)*log(reflect_x+numCoeff) - (reflect_x+numCoeff) + log(accm/reflect_x); -// 为保证 abs(log(sin(pi * z)) 计算精度 +// 为保证 abs(log(sin(pi * z))) 计算精度 float abs_input = std::abs(x); float abs_frac_input = abs_input - std::floor(abs_input); float reduced_frac_input = (abs_frac_input > 0.5) ? 1 - abs_frac_input : abs_frac_input; diff --git a/docs/design_docs/nms_rotated/nms_rotated.md b/docs/design_docs/nms_rotated/nms_rotated.md index c0edf9893..0a1936f73 100755 --- a/docs/design_docs/nms_rotated/nms_rotated.md +++ b/docs/design_docs/nms_rotated/nms_rotated.md @@ -101,8 +101,6 @@ NmsRotated 算子有 2 个输入 Tensor,分别为 `boxes`[N,5] or [N,6], `scor - 该算子输出`output`为所选择的box的索引数据。`output`是 int32_t 数据类型。因此,该算子采用静态阈值,阈值标准:diff3 = 0. -- 注意:MLU200 系列精度需要限定数值范围和规模大小,避免计算IOU时出现大规模随机错误。 - #### 1.5.2 性能验收标准 ## 2 算子接口设计 @@ -186,7 +184,6 @@ mluOpStatus_t MLUOP_WIN_API mluOpNmsRotated(mluOpHandle_t handle, - 单核BLOCK任务下: 1. MLU370: box_num < 3200 不超时 - 2. 
MLU290: box_num < 1100 不超时 ### 3.8 算子防呆检查 diff --git a/docs/design_docs/points_in_boxes/points_in_boxes.md b/docs/design_docs/points_in_boxes/points_in_boxes.md index 36ceb8b4a..1f6b83a83 100644 --- a/docs/design_docs/points_in_boxes/points_in_boxes.md +++ b/docs/design_docs/points_in_boxes/points_in_boxes.md @@ -313,7 +313,7 @@ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num, const fl __bang_mul_scalar(tmp, tmp, t+1, num); __bang_add(last, last, tmp, num); __bang_sub_scalar(last, last, -1, num); - __mluop_float2int(last, last, num); + __bang_float2int(last, last, num); store_async(output_addr, last); } diff --git a/docs/design_docs/poly_nms/poly_nms.md b/docs/design_docs/poly_nms/poly_nms.md index 8749b70d2..01e49f226 100644 --- a/docs/design_docs/poly_nms/poly_nms.md +++ b/docs/design_docs/poly_nms/poly_nms.md @@ -131,8 +131,7 @@ dets = [[0, 0, 2, 0, 2, 2, 0, np.nan, 2], [1.5, 1.5, 2.5, 1.5, 2.5, 2.5, 1.5, 2. | 原位限制 | 不支持原位 | | stride限制 | 不支持stride | | 广播限制 | 不支持广播 | -| 规模限制 | mlu270,mlu290及mlu370上输入boxes个数不超过9770个,超过规模限制会有打印报错日志。| - +| 规模限制 | mlu370上输入boxes个数不超过9770个,超过规模限制会有打印报错日志。| ### 1.5 验收标准 #### 1.5.1 精度验收标准 diff --git a/docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md b/docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md index 514fef5ec..e05c0d49e 100644 --- a/docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md +++ b/docs/design_docs/roiaware_pool3d_forward/roiaware_pool3d_forward.md @@ -274,21 +274,21 @@ __global__ roiaware_pool3d_forward_kernel(){ __device__ check_point_in_roi(float *X, float *Y, float *Z, float *local_X, float *local_Y, float *local_Z, float cx, float cy, float cz, float x_size, float y_size, float z_size, float rz, int num) { - __bang_sub_const(local_Z, Z, cz, num); // local_Z + __bang_sub_scalar(local_Z, Z, cz, num); // local_Z __bang_active_abs(tmp, local_Z, num); __bang_write_value(tmp1, num, 0.5 * dz); __bang_le(flag, tmp, tmp1, num); // Z in_flag float cosa 
= std::cos(-rz); float sina = std::sin(-rz); - __bang_sub_const(tmp, X, cx, num); - __bang_sub_const(tmp1, Y, cy, num); - __bang_mul_const(tmp2, tmp, cosa, num); - __bang_mul_const(tmp3, tmp1, sina, num); - __bang_sub_const(local_X, tmp2, tmp3, num); // local_X - - __bang_mul_const(tmp, tmp, sina, num); - __bang_mul_const(tmp1, tmp1, cosa, num); - __bang_add_const(local_Y, tmp, tmp1, num); // local_Y + __bang_sub_scalar(tmp, X, cx, num); + __bang_sub_scalar(tmp1, Y, cy, num); + __bang_mul_scalar(tmp2, tmp, cosa, num); + __bang_mul_scalar(tmp3, tmp1, sina, num); + __bang_sub_scalar(local_X, tmp2, tmp3, num); // local_X + + __bang_mul_scalar(tmp, tmp, sina, num); + __bang_mul_scalar(tmp1, tmp1, cosa, num); + __bang_add_scalar(local_Y, tmp, tmp1, num); // local_Y __bang_active_abs(tmp2, tmp2, num); __bang_write_value(tmp1, num, 0.5 * dx); diff --git a/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md b/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md index 99bee1638..60d1f2d98 100644 --- a/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md +++ b/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md @@ -364,12 +364,7 @@ void check_pts_in_box3d(const T *boxes3d, // |z - cz| __bang_active_abs(auxiliary_c, auxiliary_c, deal_num); // |z - cz| > dz / 2.0 -#if __BANG_ARCH__ >= 322 __bang_gt_scalar(auxiliary_c, auxiliary_c, (0.5 * dz), deal_num); -#else - __bang_write_value(auxiliary_d, deal_num, (0.5 * dz)); - __bang_lt(auxiliary_c, auxiliary_d, auxiliary_c, deal_num); -#endif // !(|z - cz| > dz / 2.0) __bang_not(auxiliary_c, auxiliary_c, deal_num); // (x - cx) * cos(-rz) @@ -381,12 +376,7 @@ void check_pts_in_box3d(const T *boxes3d, // |local_x| __bang_active_abs(auxiliary_d, auxiliary_d, deal_num); // |local_x| < dx / 2.0 -#if __BANG_ARCH__ >= 322 __bang_lt_scalar(auxiliary_d, auxiliary_d, (0.5 * dx), deal_num); -#else - __bang_write_value(auxiliary_e, deal_num, (0.5 * dx)); - __bang_gt(auxiliary_d, auxiliary_e, auxiliary_d, deal_num); -#endif // (x - cx) * sin(-rz) 
__bang_mul_scalar(auxiliary_e, auxiliary_a, sina, deal_num); // (y - cy) * cos(-rz) @@ -396,12 +386,7 @@ void check_pts_in_box3d(const T *boxes3d, // |local_y| __bang_active_abs(auxiliary_e, auxiliary_e, deal_num); // |local_y| < dy / 2.0 -#if __BANG_ARCH__ >= 322 __bang_lt_scalar(auxiliary_e, auxiliary_e, (0.5 * dy), deal_num); -#else - __bang_write_value(auxiliary_f, deal_num, (0.5 * dy)); - __bang_gt(auxiliary_e, auxiliary_f, auxiliary_e, deal_num); -#endif // pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0 __bang_mul(pts_assign, auxiliary_c, auxiliary_d, deal_num); __bang_mul(pts_assign, pts_assign, auxiliary_e, deal_num); diff --git a/docs/design_docs/three_nn_forward/three_nn_forward.md b/docs/design_docs/three_nn_forward/three_nn_forward.md index e0d136215..3b1f36edb 100644 --- a/docs/design_docs/three_nn_forward/three_nn_forward.md +++ b/docs/design_docs/three_nn_forward/three_nn_forward.md @@ -326,8 +326,8 @@ if(unknown_rem > 0) { ``` - step 5. 计算最小前3个`dist2`及其`index`。 - - a. 若只有一个`known_segment`, 使用`__bang_min()` 对`known_segment`所有点的dist2计算最小值及index,再将该位置置对应数据类型的最大值,进行下一次取最小值,重复3次可得前3最小值及其index,分别存储在aux_a, aux_b。 - - b. 若`known`数据需分多个`segment`处理,将每个分块中的最小前3`dist2`收集后再次进行比较得出最小前3`dist2`,才为`unknown`点跟本`known`集合所有点的最小前3`dist2`。如上使用`__bang_min()`重复取值,但分别存储在`aux_a + offset`, `aux_b + offset`,`offset` 随着每处理一个`known_segment`往后移动`3个数据位置`。如`aux_a`空间存满,则对`aux_a`已存在的`dist2`数据进行一次最小前3 `dist2`计算,并放在 `aux_a, aux_b`前3位置,其他位置清空,以待处理下一个`known_segment`。 + - a. 若只有一个`known_segment`, 使用`__bang_argmin()` 对`known_segment`所有点的dist2计算最小值及index,再将该位置置对应数据类型的最大值,进行下一次取最小值,重复3次可得前3最小值及其index,分别存储在aux_a, aux_b。 + - b. 
若`known`数据需分多个`segment`处理,将每个分块中的最小前3`dist2`收集后再次进行比较得出最小前3`dist2`,才为`unknown`点跟本`known`集合所有点的最小前3`dist2`。如上使用`__bang_argmin()`重复取值,但分别存储在`aux_a + offset`, `aux_b + offset`,`offset` 随着每处理一个`known_segment`往后移动`3个数据位置`。如`aux_a`空间存满,则对`aux_a`已存在的`dist2`数据进行一次最小前3 `dist2`计算,并放在 `aux_a, aux_b`前3位置,其他位置清空,以待处理下一个`known_segment`。 ```c++ // T *aux_a: [(0, 1, 2), (3, 4, 5), ..., 128/sizeof(T)] // int *aux_b: [(0, 1, 2), (3, 4, 5), ..., 64] @@ -339,7 +339,7 @@ __mlu_func__ void auxFuncSort(char *aux_a, char *aux_b) { for (int i = 0; i < 3; i++) { - __bang_min((T *)dest, aux_a, NFU_ALIGN_SIZE / sizeof(T)); + __bang_argmin((T *)dest, aux_a, NFU_ALIGN_SIZE / sizeof(T)); (T *)value[i] = (T)dest[0]; int index = getIndice((T)dest); indice[i] = aux_b[index]; @@ -360,7 +360,7 @@ if(deal_offset >= NFU_ALIGN_SIZE/sizeof(T) / 3) { char dest[NFU_ALIGN_SIZE]; for (int i = 0; i < 3; i++) { - __bang_min((T *)dest, (T *)dist, known_num_deal); + __bang_argmin((T *)dest, (T *)dist, known_num_deal); (T *)nram_aux_a[i + deal_offset] = (T)dest[0]; int index = getIndice((T)dest); nram_aux_b[i + deal_offset] = index; @@ -390,7 +390,7 @@ if(deal_offset > 3) { k_dims.x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dims.y = mluop::runtime::getClusterLimitCapability(handle); k_dims.z = 1; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; ``` 数据拆分如图所示: diff --git a/docs/user_guide/9_operators/index.rst b/docs/user_guide/9_operators/index.rst index bf9b37a28..3893e03a6 100755 --- a/docs/user_guide/9_operators/index.rst +++ b/docs/user_guide/9_operators/index.rst @@ -757,3 +757,39 @@ mluOpExecFFT - ``y`` 为输出信号。 - :math:`DFT_{N}` 为长度为N傅里叶变换的变换矩阵。 +.. _logspace: + +mluOpLogspace +--------------- +该算子输出长度为 `steps` 的数组,其值为以 `base` 为底数,以[start,end]区间上均分部分的数为指数生成的幂。 + +计算公式如下: + +.. 
math:: + + y = base^{start + i * (end-start) / (steps - 1)} + +其中: + +- ``base`` 为输入底数。 +- ``start,end`` 分别为指数区间的上下限。 +- ``steps`` 为指数区间的均分块数。 +- ``i`` 为均分块数的索引。 + +.. _lgamma: + +mluOpLgamma +--------------- +该算子根据 `lgamma` 函数计算输入张量input中每个元素的输出,输出相同Shape的张量。 + +计算公式如下: + +.. math:: + + lgamma(x) = ln | \Gamma (x)| \\ + \Gamma(x) = \int_{0}^{+\infty} t^{x-1} e^{-t} dt \quad(x>0) + +其中: + +- ``x`` 为输入张量。 + diff --git a/independent_build.sh b/independent_build.sh index bd9cece4a..3676e8213 100755 --- a/independent_build.sh +++ b/independent_build.sh @@ -111,7 +111,7 @@ usage () { echo " --asan Build with asan check enabled" echo " -d, --debug Build mlu-ops with debug mode" echo " --disable-gtest Build mlu-ops without gtest" - echo " --enable-bang-memcheck Build with cncc '-mllvm -enable-mlisa-sanitizer -Xbang-cnas -O0 -g' arg to enable memcheck" + echo " --enable-bang-memcheck (Deprecated, use CNSanitizer instead) Build with cncc '-mllvm -enable-mlisa-sanitizer -Xbang-cnas -O0 -g' arg to enable memcheck" echo " --enable-static Build mlu-ops static library" echo " --mlu370 Build for target product MLU370: __BANG_ARCH__ = 372" echo " __MLU_NRAM_SIZE__ = 768KB" @@ -326,6 +326,7 @@ if [ $# != 0 ]; then --enable-bang-memcheck) shift export MLUOP_BUILD_BANG_MEMCHECK="ON" + prog_log_warn "[deprecated] bang memcheck, consider use CNSanitizer instead" && sleep 3 ;; --enable-static) shift @@ -469,7 +470,7 @@ pushd ${BUILD_PATH} > /dev/null -DBUILD_VERSION="${BUILD_VERSION}" \ -DMAJOR_VERSION="${MAJOR_VERSION}" \ -DMLUOP_BUILD_ASAN_CHECK="${MLUOP_BUILD_ASAN_CHECK}" \ - -DMLUOP_BUILD_BANG_MEMCHECK="${MLUOP_BUILD_BANG_MEMCHECK}" \ + -DMLUOP_BUILD_BANG_MEMCHECK="${MLUOP_BUILD_BANG_MEMCHECK:-OFF}" \ -DMLUOP_MLU_ARCH_LIST="${MLUOP_MLU_ARCH_LIST}" \ -DMLUOP_TARGET_CPU_ARCH="${MLUOP_TARGET_CPU_ARCH}" \ -DMLUOP_BUILD_SPECIFIC_OP="${MLUOP_BUILD_SPECIFIC_OP}" \ diff --git a/kernels/abs/abs_block.mlu b/kernels/abs/abs_block.mlu index 6a1246a2d..37990dd9c 100644 --- 
a/kernels/abs/abs_block.mlu +++ b/kernels/abs/abs_block.mlu @@ -32,7 +32,7 @@ #include "kernels/unary_op/complex_unary_op_3pipeline.h" #include "kernels/unary_op/complex_unary_op_stride_3pipeline.h" -__nram__ char nram_buffer[UNARY_NRAM_SIZE]; +__nram__ int8_t nram_buffer[UNARY_NRAM_SIZE]; template __mlu_func__ void auxFunc3AbsFloat(size_t &output_input_gap, @@ -75,23 +75,27 @@ __mlu_func__ void auxComplexFunc3AbsComplexFloat( } template -__mlu_func__ void computeAbsHalfBfloat16(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, - size_t deal_num, size_t actual_num) { +__mlu_func__ void computeAbsHalfBfloat16(int8_t *nram_output, + int8_t *nram_input, + int8_t *auxiliary_a, + int8_t *auxiliary_b, size_t deal_num, + size_t actual_num) { __bang_abs((T2 *)nram_output, (T1 *)nram_input, deal_num); } template -__mlu_func__ void computeAbsFloat(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeAbsFloat(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num) { __bang_abs((T2 *)nram_output, (T1 *)nram_input, deal_num); } template -__mlu_func__ void computeAbsComplexFloat(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, - size_t deal_num, size_t actual_num) { +__mlu_func__ void computeAbsComplexFloat(int8_t *nram_output, + int8_t *nram_input, + int8_t *auxiliary_a, + int8_t *auxiliary_b, size_t deal_num, + size_t actual_num) { T2 *aux_a = (T2 *)nram_input + 2 * deal_num; T2 *aux_b = (T2 *)nram_input + 3 * deal_num; __bang_write_value((T2 *)auxiliary_a, deal_num * sizeof(T2), (int8_t)0xAA); @@ -122,15 +126,17 @@ Kernel3StagePipelineAbs(const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, const cnrtQueue_t queue, const mluOpDataType_t d_type, const void *x, void *y, size_t element_num) { if (d_type == MLUOP_DTYPE_FLOAT || d_type == MLUOP_DTYPE_INT32) { - KERNEL_CHECK(MLUBlockKernel3StagePipelineAbsFloat 
- <<>>((char *)x, (char *)y, element_num)); + KERNEL_CHECK( + MLUBlockKernel3StagePipelineAbsFloat + <<>>((int8_t *)x, (int8_t *)y, element_num)); } else if (d_type == MLUOP_DTYPE_HALF || d_type == MLUOP_DTYPE_BFLOAT16) { - KERNEL_CHECK(MLUBlockKernel3StagePipelineAbsHalfBfloat16 - <<>>((char *)x, (char *)y, element_num)); + KERNEL_CHECK( + MLUBlockKernel3StagePipelineAbsHalfBfloat16 + <<>>((int8_t *)x, (int8_t *)y, element_num)); } else { KERNEL_CHECK( MLUBlockKernel3StagePipelineComplexAbsComplexFloat - <<>>((char *)x, (char *)y, element_num)) + <<>>((int8_t *)x, (int8_t *)y, element_num)) } return MLUOP_STATUS_SUCCESS; } @@ -142,17 +148,17 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineWithStrideAbs( size_t element_num) { if (d_type == MLUOP_DTYPE_FLOAT || d_type == MLUOP_DTYPE_INT32) { KERNEL_CHECK(MLUBlockKernel3StagePipelineWithStrideAbsFloat - <<>>((char *)x, x_shape, (char *)y, + <<>>((int8_t *)x, x_shape, (int8_t *)y, y_shape, element_num)); } else if (d_type == MLUOP_DTYPE_HALF || d_type == MLUOP_DTYPE_BFLOAT16) { KERNEL_CHECK( MLUBlockKernel3StagePipelineWithStrideAbsHalfBfloat16 - <<>>((char *)x, x_shape, (char *)y, y_shape, + <<>>((int8_t *)x, x_shape, (int8_t *)y, y_shape, element_num)); } else { KERNEL_CHECK(MLUBlockKernel3StagePipelineWithStrideComplexAbsComplexFloat< double, float><<>>( - (char *)x, x_shape, (char *)y, y_shape, element_num)) + (int8_t *)x, x_shape, (int8_t *)y, y_shape, element_num)) } return MLUOP_STATUS_SUCCESS; } diff --git a/kernels/active_rotated_filter/active_rotated_filter_unionx.mlu b/kernels/active_rotated_filter/active_rotated_filter_unionx.mlu index cb1daecf6..2971bad61 100644 --- a/kernels/active_rotated_filter/active_rotated_filter_unionx.mlu +++ b/kernels/active_rotated_filter/active_rotated_filter_unionx.mlu @@ -26,7 +26,7 @@ #include "kernels/debug.h" #include "kernels/kernel.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define ROWS 8 #define COLS 9 #define 
CYCLE_ROTATE 360 @@ -180,7 +180,7 @@ __mlu_func__ void computeRotation(T *nram_output, T *nram_input, T *dst_ro = dst_base + layer * kH * kW; switch (r) { case 0: { - __bang_bor((char *)(dst_ro), (char *)(dst_ro), (char *)src_ro, + __bang_bor((int8_t *)(dst_ro), (int8_t *)(dst_ro), (int8_t *)src_ro, kH * kW * sizeof(T)); }; continue; @@ -247,28 +247,6 @@ __mlu_global__ void MLUKernelActiveRotatedFilterForward( {8, 7, 6, 5, 4, 3, 2, 1, 0}, {5, 8, 7, 2, 4, 6, 1, 0, 3}, {2, 5, 8, 1, 4, 7, 0, 3, 6}, {1, 2, 5, 0, 4, 8, 3, 6, 7}}; -#if __BANG_ARCH__ < 322 - for (int i = 0; i < num_op_per_core; i++) { - T *workspace_core = - workspace + (op_core_offset + i) * input_planes * kernel_size; - T *input_core = input + (op_core_offset + i) * input_planes * kernel_size; - T *output_core = - output + (op_core_offset + i) * rotations * input_planes * kernel_size; - - const int src_stride = kH * kW; - - for (int k = 0; k < rotations; k++) { - angle = delta_rotation * (float)k; - block_num = int(angle / delta_orientation); - - orientation_move(workspace_core, input_core, kH, kW, block_num, - total_num, kernel_size, input_planes); - rotateHW(output_core + k * input_planes * kernel_size, workspace_core, - trans, kH, kW, src_stride, src_stride, - orientations * input_planes, int(angle) / ROTATE_BASE_ANGLE); - } - } -#else if (rotations == 8) { for (int i = 0; i < num_op_per_core; i++) { T *workspace_core = @@ -405,7 +383,6 @@ __mlu_global__ void MLUKernelActiveRotatedFilterForward( __sync(); } } -#endif } mluOpStatus_t MLUOP_WIN_API KernelActiveRotatedFilterForward( diff --git a/kernels/adam_w/adam_w.cpp b/kernels/adam_w/adam_w.cpp index b746adf50..d996fe3b3 100644 --- a/kernels/adam_w/adam_w.cpp +++ b/kernels/adam_w/adam_w.cpp @@ -97,17 +97,18 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc, PARAM_CHECK("[mluOpAdamW]", momentum_desc != nullptr); PARAM_CHECK("[mluOpAdamW]", velocity_desc != nullptr); PARAM_CHECK("[mluOpAdamW]", grad_desc != nullptr); - 
PARAM_CHECK("[mluOpAdamW]", param_desc->dtype == MLUOP_DTYPE_FLOAT) - PARAM_CHECK("[mluOpAdamW]", paramh_desc->dtype == MLUOP_DTYPE_BFLOAT16) - PARAM_CHECK("[mluOpAdamW]", momentum_desc->dtype == MLUOP_DTYPE_FLOAT) - PARAM_CHECK("[mluOpAdamW]", velocity_desc->dtype == MLUOP_DTYPE_FLOAT) - PARAM_CHECK("[mluOpAdamW]", grad_desc->dtype == MLUOP_DTYPE_BFLOAT16) + PARAM_CHECK("[mluOpAdamW]", param_desc->dtype == MLUOP_DTYPE_FLOAT); + PARAM_CHECK("[mluOpAdamW]", paramh_desc->dtype == MLUOP_DTYPE_BFLOAT16); + PARAM_CHECK("[mluOpAdamW]", momentum_desc->dtype == MLUOP_DTYPE_FLOAT); + PARAM_CHECK("[mluOpAdamW]", velocity_desc->dtype == MLUOP_DTYPE_FLOAT); + PARAM_CHECK("[mluOpAdamW]", grad_desc->dtype == MLUOP_DTYPE_BFLOAT16); - PARAM_CHECK_LE("[mluOpAdamW]", beta1, 1.0) - PARAM_CHECK_GE("[mluOpAdamW]", beta1, 0.0) - PARAM_CHECK_LE("[mluOpAdamW]", beta2, 1.0) - PARAM_CHECK_GE("[mluOpAdamW]", beta2, 0.0) - PARAM_CHECK("[mluOpAdamW]", epsilon > 0) + PARAM_CHECK_LE("[mluOpAdamW]", beta1, 1.0); + PARAM_CHECK_GE("[mluOpAdamW]", beta1, 0.0); + PARAM_CHECK_LE("[mluOpAdamW]", beta2, 1.0); + PARAM_CHECK_GE("[mluOpAdamW]", beta2, 0.0); + PARAM_CHECK_GE("[mluOpAdamW]", handle->arch, MLUOP_MLU590); + PARAM_CHECK("[mluOpAdamW]", epsilon > 0); size_t param_dims = 0; size_t paramh_dims = 0; @@ -246,7 +247,7 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc, grad_dims_shape); mluOpDataType_t k_data_type = grad_dtype; cnrtDim3_t k_dim; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; k_dim.x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dim.y = mluop::runtime::getClusterLimitCapability(handle); k_dim.z = 1; @@ -261,7 +262,7 @@ mluOpAdamW(mluOpHandle_t handle, const mluOpAdamWDescriptor_t adamw_desc, GEN_CASE_END(); return MLUOP_STATUS_ARCH_MISMATCH; } - case CNRT_FUNC_TYPE_UNION1: { + case cnrtFuncTypeUnion1: { VLOG(5) << "Launch Kernel KernelApplyAdamW<<>>"; CHECK_RETURN( diff --git 
a/kernels/adam_w/adam_w_union1.mlu b/kernels/adam_w/adam_w_union1.mlu index e45bf6598..db9cc672a 100644 --- a/kernels/adam_w/adam_w_union1.mlu +++ b/kernels/adam_w/adam_w_union1.mlu @@ -31,7 +31,7 @@ #define SIZE_NRAM_PER_REGION PAD_DOWN((MAX_NRAM_SIZE / 12), NFU_ALIGN_SIZE) #define HIGH_PRECISION_MODE 1 -__nram__ char nbuf_head[MAX_NRAM_SIZE]; +__nram__ int8_t nbuf_head[MAX_NRAM_SIZE]; __mlu_func__ void computeAdamW(bfloat16_t *nbuf_paramh, bfloat16_t *nbuf_grad, float *nbuf_param, float *nbuf_grad_ptr, @@ -140,6 +140,7 @@ __mlu_global__ void unionApplyAdamW(T *param_h, T *grad, float *param, float bias2, float epsilon, float weight_decay, float scale, bool use_nesterov, size_t size) { + PERF_TIME_BEGIN(); if (__is_mpu()) { return; } @@ -218,6 +219,7 @@ __mlu_global__ void unionApplyAdamW(T *param_h, T *grad, float *param, ddr_velocity += num_x; __asm__ volatile("sync;"); } + PERF_TIME_END(); } mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW( @@ -226,10 +228,9 @@ mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW( void *momentum, void *velocity, float lr, float beta1, float beta2, float bias1, float bias2, float epsilon, float weight_decay, float scale, bool use_nesterov, size_t size, mluOpDataType_t k_data_type) { - PERF_TIME_BEGIN(); switch (k_data_type) { default: { - MLULOG("Not Implemented."); + LOG(ERROR) << "Not Implemented."; } case MLUOP_DTYPE_BFLOAT16: { KERNEL_CHECK(unionApplyAdamW<<>>( @@ -238,6 +239,5 @@ mluOpStatus_t MLUOP_WIN_API KernelApplyAdamW( epsilon, weight_decay, scale, use_nesterov, size)); }; break; } - PERF_TIME_END(); return MLUOP_STATUS_SUCCESS; } diff --git a/kernels/ball_query/ball_query.cpp b/kernels/ball_query/ball_query.cpp index f3bbf4fa2..fb6ddebde 100644 --- a/kernels/ball_query/ball_query.cpp +++ b/kernels/ball_query/ball_query.cpp @@ -58,7 +58,7 @@ void policyFuncBallQuery(const mluOpHandle_t &handle, // element. 
size_t needed_cluster_num = (total_data_num + core_in_cluster - 1) / core_in_cluster; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_in_cluster; k_dim->y = needed_cluster_num > cluster_num ? cluster_num : needed_cluster_num; diff --git a/kernels/ball_query/ball_query_union1.mlu b/kernels/ball_query/ball_query_union1.mlu index d30426308..33c4b2bca 100644 --- a/kernels/ball_query/ball_query_union1.mlu +++ b/kernels/ball_query/ball_query_union1.mlu @@ -33,7 +33,7 @@ #define REM_FOR_FLOAT2INT32 128 #define ALIGN_NUM 64 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void genIndexFunc(T *index, const uint32_t max_index, @@ -64,23 +64,14 @@ __mlu_func__ void convertFloat2Int(int32_t *dst, float *dst_addtion, const uint32_t elem_count, const uint32_t offset) { if (elem_count == 0) return; -#if __BANG_ARCH__ >= 322 int32_t *src = (int32_t *)src_origin; __bang_add_scalar((int32_t *)dst, (int32_t *)src, offset, elem_count); -#else - float *src = (float *)src_origin; - __bang_add_scalar((float *)src, (float *)src, (float)(offset), - CEIL_ALIGN(elem_count, ALIGN_NUM)); - __mluop_float2int32((int32_t *)dst, (float *)dst_addtion, (float *)src, - (float *)src_addtion, CEIL_ALIGN(elem_count, ALIGN_NUM)); -#endif } __mlu_func__ void checkPointsValid(float *distance2, float *tmp_addr, float *output_addr, float *zeros_addr, uint32_t num_deal_xyz, float min_radius2, float max_radius2) { -#if __BANG_ARCH__ >= 322 // distance2 >= min_radius2 __bang_ge_scalar(tmp_addr, distance2, min_radius2, num_deal_xyz); // distance2 < max_radius2 @@ -91,20 +82,6 @@ __mlu_func__ void checkPointsValid(float *distance2, float *tmp_addr, __bang_eq_scalar(output_addr, distance2, 0, num_deal_xyz); // distance2 == 0 | min_radius2 <= distance2 < max_radius2 __bang_or(output_addr, output_addr, tmp_addr, num_deal_xyz); -#else - // distance2 >= min_radius2 - __bang_ge_scalar(tmp_addr, distance2, min_radius2, 
num_deal_xyz); - // distance2 < max_radius2 - __bang_ge_scalar(output_addr, distance2, max_radius2, num_deal_xyz); - __bang_not(output_addr, output_addr, num_deal_xyz); - // min_radius2 <= distance2 < max_radius2 - __bang_and(tmp_addr, tmp_addr, output_addr, num_deal_xyz); - // distance2 == 0 - // __bang_write_zero(tmp2, num_deal_xyz);// 提前 - __bang_eq(output_addr, distance2, zeros_addr, num_deal_xyz); - // distance2 == 0 | min_radius2 <= distance2 < max_radius2 - __bang_or(output_addr, output_addr, tmp_addr, num_deal_xyz); -#endif } template @@ -133,9 +110,6 @@ __mlu_func__ void ballQueryWorkflow( uint32_t index_start = 0, cur_batch_id = 0; uint32_t same_batch_s = 0, same_batch_e = 0; -#if __BANG_ARCH__ < 322 - __bang_write_zero((float *)tmp2, num_stride); -#endif for (uint32_t i = 0; i < num_loop_new_xyz; ++i) { uint32_t index_new_xyz = task_start + i * num_stride; uint32_t num_deal_new_xyz = i * num_stride + num_stride > num_per_task @@ -281,36 +255,32 @@ __mlu_global__ void MLUUnion1KernelBallQuery( nfu_align_size); const uint32_t num_stride = FLOOR_ALIGN(num_stride1, 64); - char *vec_new_x1 = nram_buffer; - char *vec_new_y1 = vec_new_x1 + num_stride * sizeof(T); - char *vec_new_z1 = vec_new_y1 + num_stride * sizeof(T); - char *vec_idx_num = vec_new_z1 + num_stride * sizeof(T); + int8_t *vec_new_x1 = nram_buffer; + int8_t *vec_new_y1 = vec_new_x1 + num_stride * sizeof(T); + int8_t *vec_new_z1 = vec_new_y1 + num_stride * sizeof(T); + int8_t *vec_idx_num = vec_new_z1 + num_stride * sizeof(T); - char *vec_x1 = vec_idx_num + num_stride * sizeof(int32_t); - char *vec_y1 = vec_x1 + num_stride * sizeof(T); - char *vec_z1 = vec_y1 + num_stride * sizeof(T); - char *vec_index = vec_z1 + num_stride * sizeof(T); + int8_t *vec_x1 = vec_idx_num + num_stride * sizeof(int32_t); + int8_t *vec_y1 = vec_x1 + num_stride * sizeof(T); + int8_t *vec_z1 = vec_y1 + num_stride * sizeof(T); + int8_t *vec_index = vec_z1 + num_stride * sizeof(T); - char *vec_sub_x1 = vec_index + num_stride 
* sizeof(int32_t); - char *vec_sub_y1 = vec_sub_x1 + num_stride * sizeof(T); - char *vec_sub_z1 = vec_sub_y1 + num_stride * sizeof(T); - char *tmp1 = vec_sub_z1 + num_stride * sizeof(T); + int8_t *vec_sub_x1 = vec_index + num_stride * sizeof(int32_t); + int8_t *vec_sub_y1 = vec_sub_x1 + num_stride * sizeof(T); + int8_t *vec_sub_z1 = vec_sub_y1 + num_stride * sizeof(T); + int8_t *tmp1 = vec_sub_z1 + num_stride * sizeof(T); - char *out1 = tmp1 + num_stride * sizeof(int32_t); - char *out2 = out1 + num_stride * sizeof(T); - char *out3 = out2 + num_stride * sizeof(T); - char *tmp2 = out3 + num_stride * sizeof(T); + int8_t *out1 = tmp1 + num_stride * sizeof(int32_t); + int8_t *out2 = out1 + num_stride * sizeof(T); + int8_t *out3 = out2 + num_stride * sizeof(T); + int8_t *tmp2 = out3 + num_stride * sizeof(T); - char *src_addtion = tmp2 + num_stride * sizeof(int32_t); + int8_t *src_addtion = tmp2 + num_stride * sizeof(int32_t); const float min_radius2 = min_radius * min_radius; const float max_radius2 = max_radius * max_radius; -#if __BANG_ARCH__ >= 322 genIndexFunc((int32_t *)vec_index, num_stride, nfu_align_size); -#else - genIndexFunc((float *)vec_index, num_stride, nfu_align_size); -#endif ballQueryWorkflow( (T *)vec_new_x1, (T *)vec_new_y1, (T *)vec_new_z1, (int32_t *)vec_idx_num, diff --git a/kernels/bbox_overlaps/bbox_overlaps.cpp b/kernels/bbox_overlaps/bbox_overlaps.cpp index 0890eb233..3a82bdae0 100644 --- a/kernels/bbox_overlaps/bbox_overlaps.cpp +++ b/kernels/bbox_overlaps/bbox_overlaps.cpp @@ -39,7 +39,7 @@ static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, uint32_t core_num = union_num * core_dim; // Union1 policyFunc - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_dim; uint32_t need_core_num = (batch_num_all + core_dim - 1) / core_dim * core_dim; if (need_core_num < core_num) { diff --git a/kernels/bbox_overlaps/bbox_overlaps_union1.mlu b/kernels/bbox_overlaps/bbox_overlaps_union1.mlu index f398340f9..deaf40a11 
100644 --- a/kernels/bbox_overlaps/bbox_overlaps_union1.mlu +++ b/kernels/bbox_overlaps/bbox_overlaps_union1.mlu @@ -34,8 +34,8 @@ #define BBOX_SIZE 32 #define COORD_NUM 4 -__nram__ char nmem_buf[MAX_NRAM_SIZE]; -__nram__ char bbox_nram[BBOX_SIZE]; +__nram__ int8_t nmem_buf[MAX_NRAM_SIZE]; +__nram__ int8_t bbox_nram[BBOX_SIZE]; template __mlu_func__ inline void __mluop_max(T *dst, T *src0, T *src1, size_t num) { diff --git a/kernels/binary_op/binary_op_3pipeline.h b/kernels/binary_op/binary_op_3pipeline.h index c600f9c1e..4e3869dc3 100644 --- a/kernels/binary_op/binary_op_3pipeline.h +++ b/kernels/binary_op/binary_op_3pipeline.h @@ -35,13 +35,13 @@ template \ __mlu_global__ void MLUBlockKernel3StagePipeline##Op##Prefer( \ - char *x, char *y, char *z, size_t element_num, Args... args); + int8_t *x, int8_t *y, int8_t *z, size_t element_num, Args... args); #define BINARY_OP_KERNEL_3PIPELINE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipeline##Op##Prefer( \ - char *input1_gdram, char *input2_gdram, char *output_gdram, \ + int8_t *input1_gdram, int8_t *input2_gdram, int8_t *output_gdram, \ size_t element_num, Args... 
args) { \ if (__is_mpu()) { \ return; \ @@ -56,11 +56,11 @@ auxiliary_b_gap, auxiliary_c_gap, span_num_deal, align_num, args...); \ const size_t num_rem = element_num % taskDim; \ size_t num_per_core = element_num / taskDim; \ - const char *const input1_start = \ + const int8_t *const input1_start = \ input1_gdram + taskId * num_per_core * sizeof(DType_in1); \ - const char *const input2_start = \ + const int8_t *const input2_start = \ input2_gdram + taskId * num_per_core * sizeof(DType_in2); \ - char *const output_start = \ + int8_t *const output_start = \ output_gdram + taskId * num_per_core * sizeof(DType_out); \ if (num_rem > 0 && taskId == taskDim - 1) { \ num_per_core = num_per_core + num_rem; \ @@ -69,12 +69,12 @@ const int32_t repeat = num_per_core / span_num_deal; \ const size_t rem = num_per_core % span_num_deal; \ const size_t align_rem = CEIL_ALIGN(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input1 = nram_buffer + output_input1_gap; \ - char *ping_input2 = nram_buffer + output_input2_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ - char *auxiliary_c = nram_buffer + auxiliary_c_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input1 = nram_buffer + output_input1_gap; \ + int8_t *ping_input2 = nram_buffer + output_input2_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_c = nram_buffer + auxiliary_c_gap; \ const size_t span_load_input1_size = span_num_deal * sizeof(DType_in1); \ const size_t span_load_input2_size = span_num_deal * sizeof(DType_in2); \ const size_t span_store_size = span_num_deal * sizeof(DType_out); \ @@ -99,11 +99,9 @@ __asm__ volatile("sync;"); \ } \ for (int32_t i = 0; i < repeat - 2; ++i) { \ - pvLock(); \ __memcpy_async(output_start + i * span_store_size, \ ping_output + (i % 2) * ping_pong_gap, span_store_size, \ NRAM2GDRAM); \ - 
pvUnlock(); \ __memcpy_async(ping_input1 + (i % 2) * ping_pong_gap, \ input1_start + (i + 2) * span_load_input1_size, \ span_load_input1_size, GDRAM2NRAM); \ @@ -118,11 +116,9 @@ __asm__ volatile("sync;"); \ } \ if (repeat > 1) { \ - pvLock(); \ __memcpy_async(output_start + (repeat - 2) * span_store_size, \ ping_output + ((repeat - 2) % 2) * ping_pong_gap, \ span_store_size, NRAM2GDRAM); \ - pvUnlock(); \ } \ if (rem > 0) { \ __memcpy_async(ping_input1 + (repeat % 2) * ping_pong_gap, \ @@ -141,11 +137,9 @@ } \ __asm__ volatile("sync;"); \ if (repeat > 0) { \ - pvLock(); \ __memcpy_async(output_start + (repeat - 1) * span_store_size, \ ping_output + ((repeat - 1) % 2) * ping_pong_gap, \ span_store_size, NRAM2GDRAM); \ - pvUnlock(); \ } \ if (rem > 0) { \ compute##Op##Prefer( \ @@ -154,11 +148,9 @@ ping_input2 + (repeat % 2) * ping_pong_gap, auxiliary_a, \ auxiliary_b, auxiliary_c, align_rem, rem, args...); \ __asm__ volatile("sync;"); \ - pvLock(); \ __memcpy_async(output_start + repeat * span_store_size, \ ping_output + (repeat % 2) * ping_pong_gap, \ rem * sizeof(DType_out), NRAM2GDRAM); \ - pvUnlock(); \ } \ } @@ -167,20 +159,20 @@ template \ __mlu_global__ void MLUBlockKernel3StagePipelineV2##Op##Prefer( \ - char *x, char *y, char *z, size_t normal_core_elem_num, \ + int8_t *x, int8_t *y, int8_t *z, size_t normal_core_elem_num, \ size_t tail_core_elem_num, Args... args); #define BINARY_OP_KERNEL_3PIPELINE_V2(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineV2##Op##Prefer( \ - char *input1_gdram, char *input2_gdram, char *output_gdram, \ + int8_t *input1_gdram, int8_t *input2_gdram, int8_t *output_gdram, \ size_t normal_core_elem_num, size_t tail_core_elem_num, Args... 
args) { \ - const char *const input1_start = \ + const int8_t *const input1_start = \ input1_gdram + taskId * normal_core_elem_num * sizeof(DType_in1); \ - const char *const input2_start = \ + const int8_t *const input2_start = \ input2_gdram + taskId * normal_core_elem_num * sizeof(DType_in2); \ - char *const output_start = \ + int8_t *const output_start = \ output_gdram + taskId * normal_core_elem_num * sizeof(DType_out); \ const size_t num_cur_core = \ (taskId + 1 == taskDim) ? tail_core_elem_num : normal_core_elem_num; \ @@ -197,12 +189,12 @@ const uint32_t repeat = num_cur_core / span_num_deal; \ const size_t rem = num_cur_core % span_num_deal; \ const size_t align_rem = CEIL_ALIGN(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input1 = nram_buffer + output_input1_gap; \ - char *ping_input2 = nram_buffer + output_input2_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ - char *auxiliary_c = nram_buffer + auxiliary_c_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input1 = nram_buffer + output_input1_gap; \ + int8_t *ping_input2 = nram_buffer + output_input2_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_c = nram_buffer + auxiliary_c_gap; \ const size_t span_load_input1_size = span_num_deal * sizeof(DType_in1); \ const size_t span_load_input2_size = span_num_deal * sizeof(DType_in2); \ const size_t span_store_size = span_num_deal * sizeof(DType_out); \ diff --git a/kernels/binary_op/binary_op_5pipeline.h b/kernels/binary_op/binary_op_5pipeline.h index 6c319803a..39d315f3f 100644 --- a/kernels/binary_op/binary_op_5pipeline.h +++ b/kernels/binary_op/binary_op_5pipeline.h @@ -38,7 +38,7 @@ template \ __mlu_global__ void MLUBlockKernel5StagePipeline##Op##Prefer( \ - char *x, char *y, char *z, size_t data_num, Args... 
args) + int8_t *x, int8_t *y, int8_t *z, size_t data_num, Args... args) /**************************************************************************** * GDRAM2SRAM: io pipeline @@ -59,7 +59,7 @@ template \ __mlu_global__ void MLUBlockKernel5StagePipeline##Op##Prefer( \ - char *x, char *y, char *z, size_t data_num, Args... args) { \ + int8_t *x, int8_t *y, int8_t *z, size_t data_num, Args... args) { \ size_t span_num_deal = 0; \ size_t output_input1_gap = 0, output_input2_gap = 0; \ size_t auxiliary_a_gap = 0, auxiliary_b_gap = 0, auxiliary_c_gap = 0; \ @@ -90,12 +90,12 @@ } \ size_t align_cluster_rem_to_core = PAD_UP(cluster_rem_to_core, align_num); \ \ - char *nram_out = nram_buffer; \ - char *nram_in1 = nram_buffer + output_input1_gap; \ - char *nram_in2 = nram_buffer + output_input2_gap; \ - char *nram_aux1 = nram_buffer + auxiliary_a_gap; \ - char *nram_aux2 = nram_buffer + auxiliary_b_gap; \ - char *nram_aux3 = nram_buffer + auxiliary_c_gap; \ + int8_t *nram_out = nram_buffer; \ + int8_t *nram_in1 = nram_buffer + output_input1_gap; \ + int8_t *nram_in2 = nram_buffer + output_input2_gap; \ + int8_t *nram_aux1 = nram_buffer + auxiliary_a_gap; \ + int8_t *nram_aux2 = nram_buffer + auxiliary_b_gap; \ + int8_t *nram_aux3 = nram_buffer + auxiliary_c_gap; \ \ if (repeat > 0) { \ __memcpy_async(sram_in1, base_cluster_in1, sram_num * sizeof(DType_in1), \ @@ -223,6 +223,6 @@ template \ __mlu_global__ void MLUBlockKernel5StagePipeline##Op##Prefer( \ - char *x, char *y, char *z, size_t data_num, Args... args) {} + int8_t *x, int8_t *y, int8_t *z, size_t data_num, Args... 
args) {} #endif #endif // KERNELS_BINARY_OP_BINARY_OP_5PIPELINE_H_ diff --git a/kernels/binary_op/binary_op_host.cpp b/kernels/binary_op/binary_op_host.cpp index c99959c04..195db20c2 100644 --- a/kernels/binary_op/binary_op_host.cpp +++ b/kernels/binary_op/binary_op_host.cpp @@ -49,7 +49,7 @@ void binaryOpPolicyFunc(mluOpHandle_t handle, const int pad_up_size, size = PAD_UP(size, pad_up_size); // Union1 policyFunc - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_dim; const uint64_t maximum_partitions = PAD_UP(size / pad_up_size, core_dim); if (maximum_partitions < core_number) { @@ -66,7 +66,7 @@ void binaryOpBlockPolicyFunc(mluOpHandle_t handle, cnrtFunctionType_t &k_type, size_t &normal_core_elem_num, size_t &tail_core_elem_num) { - k_type = CNRT_FUNC_TYPE_BLOCK; + k_type = cnrtFuncTypeBlock; const uint32_t core_number = mluop::runtime::getMaxParallelJobNum(handle, k_type); const size_t element_num = mluOpGetTensorElementNum(desc); diff --git a/kernels/binary_op/binary_op_stride_3pipeline.h b/kernels/binary_op/binary_op_stride_3pipeline.h index 193b229c3..10515cd68 100644 --- a/kernels/binary_op/binary_op_stride_3pipeline.h +++ b/kernels/binary_op/binary_op_stride_3pipeline.h @@ -35,20 +35,20 @@ #define BINARY_NRAM_SIZE (MAX_NRAM_SIZE + REM_FOR_STACK - 112 * 1024) #define BINARY_SRAM_SIZE (CORE_DIM * BINARY_NRAM_SIZE) -#define BINARY_OP_PIP3_WITH_STRIDE_DECLARE(Op, Prefer) \ - template \ - __mlu_global__ void MLUBlockKernelBinaryPipe3WithStride##Op##Prefer( \ - char *x, TensorShape x_shape, char *y, TensorShape y_shape, char *z, \ - TensorShape z_shape, size_t element_num, \ +#define BINARY_OP_PIP3_WITH_STRIDE_DECLARE(Op, Prefer) \ + template \ + __mlu_global__ void MLUBlockKernelBinaryPipe3WithStride##Op##Prefer( \ + int8_t *x, TensorShape x_shape, int8_t *y, TensorShape y_shape, \ + int8_t *z, TensorShape z_shape, size_t element_num, \ mluOpComputationPreference_t prefer, Args... 
args); #define BINARY_OP_PIP3_WITH_STRIDE_KERNEL(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernelBinaryPipe3WithStride##Op##Prefer( \ - char *x, TensorShape x_shape, char *y, TensorShape y_shape, char *z, \ - TensorShape z_shape, size_t element_num, \ + int8_t *x, TensorShape x_shape, int8_t *y, TensorShape y_shape, \ + int8_t *z, TensorShape z_shape, size_t element_num, \ mluOpComputationPreference_t prefer, Args... args) { \ if (__is_mpu()) { \ return; \ @@ -70,12 +70,12 @@ int repeat = num_per_core / span_num_deal; \ size_t rem = num_per_core % span_num_deal; \ int32_t align_rem = PAD_UP(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input1 = nram_buffer + output_input1_gap; \ - char *ping_input2 = nram_buffer + output_input2_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ - char *auxiliary_c = nram_buffer + auxiliary_c_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input1 = nram_buffer + output_input1_gap; \ + int8_t *ping_input2 = nram_buffer + output_input2_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_c = nram_buffer + auxiliary_c_gap; \ \ if (repeat > 0) { \ offset = core_offset; \ diff --git a/kernels/binary_op/binary_op_tensor_scalar_3pipeline.h b/kernels/binary_op/binary_op_tensor_scalar_3pipeline.h index fe56ac708..8b9ec7064 100644 --- a/kernels/binary_op/binary_op_tensor_scalar_3pipeline.h +++ b/kernels/binary_op/binary_op_tensor_scalar_3pipeline.h @@ -50,7 +50,7 @@ template \ __mlu_global__ void MLUBlockKernelBinaryTensorScalarPipe3##Op##Prefer( \ - char *input_tensor, char *input_scalar, char *output, \ + int8_t *input_tensor, int8_t *input_scalar, int8_t *output, \ uint32_t host_scalar, mluOpPointerMode_t pointer_mode, \ size_t element_num, Args... 
args); @@ -58,7 +58,7 @@ template \ __mlu_global__ void MLUBlockKernelBinaryTensorScalarPipe3##Op##Prefer( \ - char *input_tensor, char *input_scalar, char *output_tensor, \ + int8_t *input_tensor, int8_t *input_scalar, int8_t *output_tensor, \ uint32_t host_scalar, mluOpPointerMode_t pointer_mode, \ size_t element_num, Args... args) { \ if (__is_mpu()) { \ @@ -86,12 +86,12 @@ int32_t repeat = num_per_core / span_num_deal; \ size_t rem = num_per_core % span_num_deal; \ size_t align_rem = PAD_UP(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ - char *auxiliary_c = nram_buffer + auxiliary_c_gap; \ - char *scalar = nram_buffer + BINARY_NRAM_SIZE; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_c = nram_buffer + auxiliary_c_gap; \ + int8_t *scalar = nram_buffer + BINARY_NRAM_SIZE; \ if (pointer_mode == MLUOP_POINTER_MODE_HOST) { \ ((DType_scalar *)scalar)[0] = *((DType_scalar *)&host_scalar); \ } else { \ @@ -170,18 +170,19 @@ template \ __mlu_global__ void MLUBlockKernelBinaryStrideTensorScalarPipe3##Op##Prefer( \ - char *input_tensor, TensorShape input_tensor_shape, char *input_scalar, \ - char *output, TensorShape output_shape, uint32_t host_scalar, \ - mluOpPointerMode_t pointer_mode, size_t element_num, Args... args); + int8_t *input_tensor, TensorShape input_tensor_shape, \ + int8_t *input_scalar, int8_t *output, TensorShape output_shape, \ + uint32_t host_scalar, mluOpPointerMode_t pointer_mode, \ + size_t element_num, Args... 
args); #define BINARY_OP_STRIDE_TENSOR_SCALAR_PIP3_KERNEL(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernelBinaryStrideTensorScalarPipe3##Op##Prefer( \ - char *input_tensor, TensorShape input_tensor_shape, char *input_scalar, \ - char *output_tensor, TensorShape output_tensor_shape, \ - uint32_t host_scalar, mluOpPointerMode_t pointer_mode, \ - size_t element_num, Args... args) { \ + int8_t *input_tensor, TensorShape input_tensor_shape, \ + int8_t *input_scalar, int8_t *output_tensor, \ + TensorShape output_tensor_shape, uint32_t host_scalar, \ + mluOpPointerMode_t pointer_mode, size_t element_num, Args... args) { \ if (__is_mpu()) { \ return; \ } \ @@ -206,12 +207,12 @@ int32_t repeat = num_per_core / span_num_deal; \ size_t rem = num_per_core % span_num_deal; \ size_t align_rem = PAD_UP(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ - char *auxiliary_c = nram_buffer + auxiliary_c_gap; \ - char *scalar = nram_buffer + BINARY_NRAM_SIZE; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_c = nram_buffer + auxiliary_c_gap; \ + int8_t *scalar = nram_buffer + BINARY_NRAM_SIZE; \ if (pointer_mode == MLUOP_POINTER_MODE_HOST) { \ ((DType_scalar *)scalar)[0] = *((DType_scalar *)&host_scalar); \ } else { \ diff --git a/kernels/border_align/border_align_backward/border_align_backward.cpp b/kernels/border_align/border_align_backward/border_align_backward.cpp index dc68039ec..c507de17a 100644 --- a/kernels/border_align/border_align_backward/border_align_backward.cpp +++ b/kernels/border_align/border_align_backward/border_align_backward.cpp @@ -36,7 +36,7 @@ static void policyFunc(mluOpHandle_t handle, cnrtDim3_t 
*k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; diff --git a/kernels/border_align/border_align_backward/border_align_backward_union1.mlu b/kernels/border_align/border_align_backward/border_align_backward_union1.mlu index 61c23ad01..f475a9afb 100644 --- a/kernels/border_align/border_align_backward/border_align_backward_union1.mlu +++ b/kernels/border_align/border_align_backward/border_align_backward_union1.mlu @@ -27,7 +27,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define BORDER_NUM 4 #define CALCULATE_GRAD_INPUT(w, x, y) \ @@ -36,8 +36,8 @@ __nram__ char nram_buffer[MAX_NRAM_SIZE]; x * origin_c * BORDER_NUM + border * origin_c + \ c; \ __bang_mul_scalar(nram_grad_input, nram_grad_output, w, deal_num_align); \ - __bang_band((char *)nram_grad_input, (char *)nram_grad_input, (char *)mask, \ - sizeof(T) * deal_num_align); \ + __bang_band((int8_t *)nram_grad_input, (int8_t *)nram_grad_input, \ + (int8_t *)mask, sizeof(T) * deal_num_align); \ __bang_atomic_reduce_add(grad_input + offset_##w, nram_grad_input, deal_num); template @@ -177,15 +177,17 @@ __mlu_func__ void computeImpl(T *nram_grad_output, const T *grad_output, deal_num_align); // NOLINT if (__mluop_is_float()) { __nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT + __bang_lut((int32_t *)nram_argmax_idx, (uint32_t *)nram_argmax_idx, + table, (uint32_t)deal_num_align, + COMPUTE_COUNT_ALIGN); // NOLINT } else { __nram__ int16_t table[COMPUTE_COUNT_ALIGN] = {0, (int16_t)0xffff}; __bang_int322int16((int16_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, 
deal_num_align, 0, 0); // NOLINT - __bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT + __bang_lut((int16_t *)nram_argmax_idx, (uint16_t *)nram_argmax_idx, + table, (uint32_t)deal_num_align, + COMPUTE_COUNT_ALIGN); // NOLINT } // load grad_output, and calculate grad_input @@ -228,7 +230,7 @@ __mlu_global__ void MLUKernelBorderAlignBackward( (MAX_NRAM_SIZE - NFU_ALIGN_SIZE) / (2 * sizeof(T) + 1 * sizeof(int32_t)), NFU_ALIGN_SIZE); T *nram_boxes = (T *)nram_buffer; - T *nram_grad_output = (T *)((char *)nram_buffer + NFU_ALIGN_SIZE); + T *nram_grad_output = (T *)((int8_t *)nram_buffer + NFU_ALIGN_SIZE); T *nram_grad_input = (T *)nram_grad_output + deal_num; int32_t *nram_argmax_idx = (int32_t *)((T *)nram_grad_input + deal_num); diff --git a/kernels/border_align/border_align_forward/border_align_forward.cpp b/kernels/border_align/border_align_forward/border_align_forward.cpp index dd6787888..d57601c36 100644 --- a/kernels/border_align/border_align_forward/border_align_forward.cpp +++ b/kernels/border_align/border_align_forward/border_align_forward.cpp @@ -34,7 +34,7 @@ // policyFunc static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; diff --git a/kernels/border_align/border_align_forward/border_align_forward_union1.mlu b/kernels/border_align/border_align_forward/border_align_forward_union1.mlu index b18449b6f..840206ba2 100644 --- a/kernels/border_align/border_align_forward/border_align_forward_union1.mlu +++ b/kernels/border_align/border_align_forward/border_align_forward_union1.mlu @@ -29,7 +29,7 @@ #define BORDER_NUM 4 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void 
bilinearInterpolate(const int32_t input_height, @@ -303,7 +303,7 @@ __mlu_global__ void MLUKernelBorderAlignForward( T *input_ping_nram = (T *)nram_buffer; T *output_nram = input_ping_nram + pingpong_split_num * deal_num; T *boxes_nram = output_nram + deal_num; - int32_t *argmax_idx_nram = (int32_t *)((char *)boxes_nram + NFU_ALIGN_SIZE); + int32_t *argmax_idx_nram = (int32_t *)((int8_t *)boxes_nram + NFU_ALIGN_SIZE); /* * input.shape = [origin_n, origin_h, origin_w, border_num * origin_c] diff --git a/kernels/box_iou_rotated/box_iou_rotated.cpp b/kernels/box_iou_rotated/box_iou_rotated.cpp index 11b4c1f4e..f10044d5e 100644 --- a/kernels/box_iou_rotated/box_iou_rotated.cpp +++ b/kernels/box_iou_rotated/box_iou_rotated.cpp @@ -39,7 +39,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, const int num_box1, const int num_box2) { // When current MLU arch only support Block type job if (mluop::runtime::getJobLimitCapability(handle) == CN_KERNEL_CLASS_BLOCK) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = 1; k_dim->y = 1; k_dim->z = 1; @@ -48,7 +48,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, } // union1 policy func - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; // dimx equals to num of mlu cores in each cluster k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); // dimy equals to num of current available clusters @@ -59,7 +59,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, const uint32_t single_core_small_case = 64; if (single_core_small_case >= num_box1) { // only 1 mlu core enough - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = 1; k_dim->y = 1; VLOG(5) << "Launch Kernel MLUKernelBoxIouRotated in BLOCK type"; diff --git a/kernels/box_iou_rotated/box_iou_rotated_aligned.h b/kernels/box_iou_rotated/box_iou_rotated_aligned.h index 5cd7bbf9f..67a948ffe 100644 --- 
a/kernels/box_iou_rotated/box_iou_rotated_aligned.h +++ b/kernels/box_iou_rotated/box_iou_rotated_aligned.h @@ -62,8 +62,7 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, const uint32_t max_box_pair = FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN); // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((unsigned char *)nram_buffer, - copies_of_nram * max_box_pair); + __bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair); void *box1_trans = nram_buffer + 4 * max_box_pair * sizeof(T); void *box2_trans = @@ -74,50 +73,55 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, nram_buffer + (3 * COMPUTE_COUNT_ALIGN + 4) * max_box_pair * sizeof(T); // After transpose, box1/2_onchip data can be over-written - void *temp1_ram = (char *)box2_onchip; - void *temp2_ram = ((char *)box2_onchip) + 1 * max_box_pair * sizeof(T); - void *temp3_ram = ((char *)box2_onchip) + 2 * max_box_pair * sizeof(T); + void *temp1_ram = (int8_t *)box2_onchip; + void *temp2_ram = ((int8_t *)box2_onchip) + 1 * max_box_pair * sizeof(T); + void *temp3_ram = ((int8_t *)box2_onchip) + 2 * max_box_pair * sizeof(T); void *valid_box = nram_buffer; void *area1_ram = nram_buffer + 1 * max_box_pair * sizeof(T); void *area2_ram = nram_buffer + 2 * max_box_pair * sizeof(T); void *ious_ram = nram_buffer + 3 * max_box_pair * sizeof(T); // Rotated vertices, each box has 4 vertices, each point has (x, y) - void *rotated_pts1_x = ((char *)box1_onchip) + 48 * max_box_pair * sizeof(T); - void *rotated_pts1_y = ((char *)box1_onchip) + 52 * max_box_pair * sizeof(T); - void *rotated_pts2_x = ((char *)box1_onchip) + 56 * max_box_pair * sizeof(T); - void *rotated_pts2_y = ((char *)box1_onchip) + 60 * max_box_pair * sizeof(T); - void *temp4_ram = ((char *)box2_onchip) + 52 * max_box_pair * sizeof(T); - void *temp5_ram = ((char *)box2_onchip) + 53 * max_box_pair * sizeof(T); + void 
*rotated_pts1_x = + ((int8_t *)box1_onchip) + 48 * max_box_pair * sizeof(T); + void *rotated_pts1_y = + ((int8_t *)box1_onchip) + 52 * max_box_pair * sizeof(T); + void *rotated_pts2_x = + ((int8_t *)box1_onchip) + 56 * max_box_pair * sizeof(T); + void *rotated_pts2_y = + ((int8_t *)box1_onchip) + 60 * max_box_pair * sizeof(T); + void *temp4_ram = ((int8_t *)box2_onchip) + 52 * max_box_pair * sizeof(T); + void *temp5_ram = ((int8_t *)box2_onchip) + 53 * max_box_pair * sizeof(T); // After calculating rotated vertices, box1/2_trans data can be over-written // Intersect points = [24xN] points, each point has (x, y) - void *intersect_pts_x = (char *)box1_onchip; - void *intersect_pts_y = ((char *)box1_onchip) + 24 * max_box_pair * sizeof(T); + void *intersect_pts_x = (int8_t *)box1_onchip; + void *intersect_pts_y = + ((int8_t *)box1_onchip) + 24 * max_box_pair * sizeof(T); // Record whether this position of intersect points is valid or not - void *valid_pts = (char *)box1_trans; + void *valid_pts = (int8_t *)box1_trans; // Record each box pair has how many valid intersect points - void *nums_in_ram = ((char *)box2_onchip) + 3 * max_box_pair * sizeof(T); + void *nums_in_ram = ((int8_t *)box2_onchip) + 3 * max_box_pair * sizeof(T); // Line vector, from p1 to p2 is: p1+(p2-p1)*t, t=[0,1] - void *vec1_x = ((char *)box1_trans) + 24 * max_box_pair * sizeof(T); - void *vec1_y = ((char *)box1_trans) + 28 * max_box_pair * sizeof(T); - void *vec2_x = ((char *)box1_trans) + 32 * max_box_pair * sizeof(T); - void *vec2_y = ((char *)box1_trans) + 36 * max_box_pair * sizeof(T); + void *vec1_x = ((int8_t *)box1_trans) + 24 * max_box_pair * sizeof(T); + void *vec1_y = ((int8_t *)box1_trans) + 28 * max_box_pair * sizeof(T); + void *vec2_x = ((int8_t *)box1_trans) + 32 * max_box_pair * sizeof(T); + void *vec2_y = ((int8_t *)box1_trans) + 36 * max_box_pair * sizeof(T); - void *temp6_ram = ((char *)box2_onchip) + 54 * max_box_pair * sizeof(T); - void *temp7_ram = ((char *)box2_onchip) + 55 
* max_box_pair * sizeof(T); - void *temp8_ram = ((char *)box2_onchip) + 56 * max_box_pair * sizeof(T); - void *temp9_ram = ((char *)box2_onchip) + 57 * max_box_pair * sizeof(T); + void *temp6_ram = ((int8_t *)box2_onchip) + 54 * max_box_pair * sizeof(T); + void *temp7_ram = ((int8_t *)box2_onchip) + 55 * max_box_pair * sizeof(T); + void *temp8_ram = ((int8_t *)box2_onchip) + 56 * max_box_pair * sizeof(T); + void *temp9_ram = ((int8_t *)box2_onchip) + 57 * max_box_pair * sizeof(T); // Ordered points = [24xN] points, each point has (x, y) - void *ordered_pts_x = ((char *)box2_onchip) + 4 * max_box_pair * sizeof(T); - void *ordered_pts_y = ((char *)box2_onchip) + 28 * max_box_pair * sizeof(T); + void *ordered_pts_x = ((int8_t *)box2_onchip) + 4 * max_box_pair * sizeof(T); + void *ordered_pts_y = ((int8_t *)box2_onchip) + 28 * max_box_pair * sizeof(T); - void *dist_ram = ((char *)box1_trans) + 24 * max_box_pair * sizeof(T); - void *temp_long_1 = ((char *)box1_trans) + 48 * max_box_pair * sizeof(T); - void *temp_long_2 = ((char *)box1_trans) + 72 * max_box_pair * sizeof(T); - void *temp_long_3 = ((char *)box1_trans) + 96 * max_box_pair * sizeof(T); + void *dist_ram = ((int8_t *)box1_trans) + 24 * max_box_pair * sizeof(T); + void *temp_long_1 = ((int8_t *)box1_trans) + 48 * max_box_pair * sizeof(T); + void *temp_long_2 = ((int8_t *)box1_trans) + 72 * max_box_pair * sizeof(T); + void *temp_long_3 = ((int8_t *)box1_trans) + 96 * max_box_pair * sizeof(T); // load offchip current data, for loop uint32_t repeat = num_box / max_box_pair; @@ -267,16 +271,16 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, __nram__ int table[TABLE_LENGTH] = {0, FIILED_ONES}; __bang_float2int32((int32_t *)temp9_ram, (float *)temp9_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp9_ram, (int32_t *)temp9_ram, - (int32_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int32_t *)temp9_ram, (uint32_t *)temp9_ram, (int32_t *)table, + 
actual_compute_box_num, TABLE_LENGTH); } else { __nram__ int16_t table[TABLE_LENGTH] = {0, HALF_FILLED_ONES}; __bang_half2int16_rd((int16_t *)temp9_ram, (half *)temp9_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp9_ram, (int16_t *)temp9_ram, - (int16_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int16_t *)temp9_ram, (uint16_t *)temp9_ram, (int16_t *)table, + actual_compute_box_num, TABLE_LENGTH); } - __bang_band((char *)ious_ram, (char *)ious_ram, (char *)temp9_ram, + __bang_band((int8_t *)ious_ram, (int8_t *)ious_ram, (int8_t *)temp9_ram, actual_compute_box_num * sizeof(T)); __memcpy(ious + current_ious_offset, (T *)ious_ram, diff --git a/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h b/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h old mode 100755 new mode 100644 index 579328fd2..ec44b3abd --- a/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h +++ b/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h @@ -68,8 +68,7 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, const uint32_t max_box_pair = FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN); // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((unsigned char *)nram_buffer, - copies_of_nram * max_box_pair); + __bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair); void *box1_onchip = nram_buffer + 2 * max_box_pair * sizeof(T); void *box2_onchip = @@ -84,43 +83,46 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, void *ious_ram = nram_buffer + 1 * max_box_pair * sizeof(T); void *valid_box = nram_buffer + 1 * max_box_pair * sizeof(T); - void *new_pts2 = ((char *)box2_trans) + 5 * max_box_pair * sizeof(T); + void *new_pts2 = ((int8_t *)box2_trans) + 5 * max_box_pair * sizeof(T); // over-written Intersect points = [24xN] points, each point has (x, y) - void *intersect_pts_x = ((char *)box1_trans) + 16 * max_box_pair * 
sizeof(T); - void *intersect_pts_y = ((char *)box1_trans) + 40 * max_box_pair * sizeof(T); + void *intersect_pts_x = + ((int8_t *)box1_trans) + 16 * max_box_pair * sizeof(T); + void *intersect_pts_y = + ((int8_t *)box1_trans) + 40 * max_box_pair * sizeof(T); // Record whether this position of intersect points is valid or not - void *valid_pts = ((char *)box1_onchip) + 40 * max_box_pair * sizeof(T); + void *valid_pts = ((int8_t *)box1_onchip) + 40 * max_box_pair * sizeof(T); // Record each box pair has how many valid intersect points - void *nums_in_ram = ((char *)box1_onchip) + 10 * max_box_pair * sizeof(T); - - void *rotated_pts1_x = ((char *)box2_onchip); - void *rotated_pts1_y = ((char *)box2_onchip) + 4 * max_box_pair * sizeof(T); - void *rotated_pts2_x = ((char *)box2_onchip) + 8 * max_box_pair * sizeof(T); - void *rotated_pts2_y = ((char *)box2_onchip) + 12 * max_box_pair * sizeof(T); - - void *temp1_ram = ((char *)box1_onchip) + 5 * max_box_pair * sizeof(T); - void *temp2_ram = ((char *)box1_onchip) + 6 * max_box_pair * sizeof(T); - void *temp3_ram = ((char *)box1_onchip) + 7 * max_box_pair * sizeof(T); - void *temp4_ram = ((char *)box1_onchip) + 8 * max_box_pair * sizeof(T); - void *temp5_ram = ((char *)box1_onchip) + 9 * max_box_pair * sizeof(T); - void *temp6_ram = ((char *)box1_onchip) + 11 * max_box_pair * sizeof(T); - void *temp7_ram = ((char *)box1_onchip) + 12 * max_box_pair * sizeof(T); - void *temp8_ram = ((char *)box1_onchip) + 13 * max_box_pair * sizeof(T); - void *temp9_ram = ((char *)box1_onchip) + 14 * max_box_pair * sizeof(T); - - void *vec1_x = ((char *)box2_onchip) + 16 * max_box_pair * sizeof(T); - void *vec1_y = ((char *)box2_onchip) + 20 * max_box_pair * sizeof(T); - void *vec2_x = ((char *)box2_onchip) + 24 * max_box_pair * sizeof(T); - void *vec2_y = ((char *)box2_onchip) + 28 * max_box_pair * sizeof(T); - - void *ordered_pts_x = ((char *)box2_trans) + 16 * max_box_pair * sizeof(T); - void *ordered_pts_y = ((char *)box2_trans) + 40 * 
max_box_pair * sizeof(T); - - void *dist_ram = ((char *)box1_onchip) + 16 * max_box_pair * sizeof(T); - void *temp_long_1 = ((char *)box2_onchip); - void *temp_long_2 = ((char *)box2_onchip) + 24 * max_box_pair * sizeof(T); - void *temp_long_3 = ((char *)box2_onchip) + 48 * max_box_pair * sizeof(T); + void *nums_in_ram = ((int8_t *)box1_onchip) + 10 * max_box_pair * sizeof(T); + + void *rotated_pts1_x = ((int8_t *)box2_onchip); + void *rotated_pts1_y = ((int8_t *)box2_onchip) + 4 * max_box_pair * sizeof(T); + void *rotated_pts2_x = ((int8_t *)box2_onchip) + 8 * max_box_pair * sizeof(T); + void *rotated_pts2_y = + ((int8_t *)box2_onchip) + 12 * max_box_pair * sizeof(T); + + void *temp1_ram = ((int8_t *)box1_onchip) + 5 * max_box_pair * sizeof(T); + void *temp2_ram = ((int8_t *)box1_onchip) + 6 * max_box_pair * sizeof(T); + void *temp3_ram = ((int8_t *)box1_onchip) + 7 * max_box_pair * sizeof(T); + void *temp4_ram = ((int8_t *)box1_onchip) + 8 * max_box_pair * sizeof(T); + void *temp5_ram = ((int8_t *)box1_onchip) + 9 * max_box_pair * sizeof(T); + void *temp6_ram = ((int8_t *)box1_onchip) + 11 * max_box_pair * sizeof(T); + void *temp7_ram = ((int8_t *)box1_onchip) + 12 * max_box_pair * sizeof(T); + void *temp8_ram = ((int8_t *)box1_onchip) + 13 * max_box_pair * sizeof(T); + void *temp9_ram = ((int8_t *)box1_onchip) + 14 * max_box_pair * sizeof(T); + + void *vec1_x = ((int8_t *)box2_onchip) + 16 * max_box_pair * sizeof(T); + void *vec1_y = ((int8_t *)box2_onchip) + 20 * max_box_pair * sizeof(T); + void *vec2_x = ((int8_t *)box2_onchip) + 24 * max_box_pair * sizeof(T); + void *vec2_y = ((int8_t *)box2_onchip) + 28 * max_box_pair * sizeof(T); + + void *ordered_pts_x = ((int8_t *)box2_trans) + 16 * max_box_pair * sizeof(T); + void *ordered_pts_y = ((int8_t *)box2_trans) + 40 * max_box_pair * sizeof(T); + + void *dist_ram = ((int8_t *)box1_onchip) + 16 * max_box_pair * sizeof(T); + void *temp_long_1 = ((int8_t *)box2_onchip); + void *temp_long_2 = ((int8_t *)box2_onchip) 
+ 24 * max_box_pair * sizeof(T); + void *temp_long_3 = ((int8_t *)box2_onchip) + 48 * max_box_pair * sizeof(T); // load offchip current data, for loop uint32_t repeat_box1 = num_box1 / max_box_pair; @@ -358,18 +360,16 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, __nram__ int table[TABLE_LENGTH] = {0, FIILED_ONES}; __bang_float2int32((int32_t *)temp9_ram, (float *)temp9_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp9_ram, (int32_t *)temp9_ram, - (int32_t *)table, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int32_t *)temp9_ram, (uint32_t *)temp9_ram, + (int32_t *)table, actual_compute_box_num, TABLE_LENGTH); } else { __nram__ int16_t table[TABLE_LENGTH] = {0, HALF_FILLED_ONES}; __bang_half2int16_rd((int16_t *)temp9_ram, (half *)temp9_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp9_ram, (int16_t *)temp9_ram, - (int16_t *)table, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int16_t *)temp9_ram, (uint16_t *)temp9_ram, + (int16_t *)table, actual_compute_box_num, TABLE_LENGTH); } - __bang_band((char *)ious_ram, (char *)ious_ram, (char *)temp9_ram, + __bang_band((int8_t *)ious_ram, (int8_t *)ious_ram, (int8_t *)temp9_ram, actual_compute_box_num * sizeof(T)); __memcpy(ious + current_ious_offset, (T *)ious_ram, diff --git a/kernels/box_iou_rotated/box_iou_rotated_utils.h b/kernels/box_iou_rotated/box_iou_rotated_utils.h index 19d7a786b..7c3e8d270 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_utils.h +++ b/kernels/box_iou_rotated/box_iou_rotated_utils.h @@ -31,7 +31,7 @@ // each box data contains 5 number: x, y, w, h, a #define SINGLE_BOX_DIM 5 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; // cross2d(A, B) = A.x * B.y - A.y * B.x; template @@ -248,19 +248,19 @@ __mlu_func__ void getIntersectionPoints( __nram__ int table[TABLE_LENGTH] = {0, FIILED_ONES}; __bang_float2int32((int32_t *)temp2_ram, (float *)temp1_ram, 
actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp2_ram, (int32_t *)temp2_ram, - (int32_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int32_t *)temp2_ram, (uint32_t *)temp2_ram, + (int32_t *)table, actual_compute_box_num, TABLE_LENGTH); } else { __nram__ int16_t table[TABLE_LENGTH] = {0, HALF_FILLED_ONES}; __bang_half2int16_rd((int16_t *)temp2_ram, (half *)temp2_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp2_ram, (int16_t *)temp2_ram, - (int16_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int16_t *)temp2_ram, (uint16_t *)temp2_ram, + (int16_t *)table, actual_compute_box_num, TABLE_LENGTH); } - __bang_band( - (char *)((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num), - (char *)temp7_ram, (char *)temp2_ram, - actual_compute_box_num * sizeof(T)); + __bang_band((int8_t *)((T *)intersect_pts_x + + (4 * i + j) * actual_compute_box_num), + (int8_t *)temp7_ram, (int8_t *)temp2_ram, + actual_compute_box_num * sizeof(T)); __bang_mul((T *)temp7_ram, (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram, actual_compute_box_num); @@ -268,10 +268,10 @@ __mlu_func__ void getIntersectionPoints( (T *)rotated_pts1_y + i * actual_compute_box_num, (T *)temp7_ram, actual_compute_box_num); - __bang_band( - (char *)((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num), - (char *)temp7_ram, (char *)temp2_ram, - actual_compute_box_num * sizeof(T)); + __bang_band((int8_t *)((T *)intersect_pts_y + + (4 * i + j) * actual_compute_box_num), + (int8_t *)temp7_ram, (int8_t *)temp2_ram, + actual_compute_box_num * sizeof(T)); // Assign `valid_pts` bit and accumulate `nums_in` of valid points of each // box pair @@ -336,23 +336,23 @@ __mlu_func__ void getIntersectionPoints( __nram__ int table[TABLE_LENGTH] = {0, FIILED_ONES}; __bang_float2int32((int32_t *)temp2_ram, (float *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp2_ram, (int32_t *)temp2_ram, - (int32_t *)table, 
actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int32_t *)temp2_ram, (uint32_t *)temp2_ram, (int32_t *)table, + actual_compute_box_num, TABLE_LENGTH); } else { __nram__ int16_t table[TABLE_LENGTH] = {0, HALF_FILLED_ONES}; __bang_half2int16_rd((int16_t *)temp2_ram, (half *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp2_ram, (int16_t *)temp2_ram, - (int16_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int16_t *)temp2_ram, (uint16_t *)temp2_ram, (int16_t *)table, + actual_compute_box_num, TABLE_LENGTH); } __bang_band( - (char *)((T *)intersect_pts_x + (16 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts1_x + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_x + (16 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts1_x + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); __bang_band( - (char *)((T *)intersect_pts_y + (16 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts1_y + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_y + (16 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts1_y + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); // assign valid_pts bit and accumulate nums of valid points of each box pair __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num, @@ -412,23 +412,23 @@ __mlu_func__ void getIntersectionPoints( __nram__ int table[TABLE_LENGTH] = {0, FIILED_ONES}; __bang_float2int32((int32_t *)temp2_ram, (float *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp2_ram, (int32_t *)temp2_ram, - (int32_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int32_t *)temp2_ram, (uint32_t *)temp2_ram, (int32_t *)table, + actual_compute_box_num, TABLE_LENGTH); } else { __nram__ int16_t table[TABLE_LENGTH] = {0, 
HALF_FILLED_ONES}; __bang_half2int16_rd((int16_t *)temp2_ram, (half *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp2_ram, (int16_t *)temp2_ram, - (int16_t *)table, actual_compute_box_num, TABLE_LENGTH); + __bang_lut((int16_t *)temp2_ram, (uint16_t *)temp2_ram, (int16_t *)table, + actual_compute_box_num, TABLE_LENGTH); } __bang_band( - (char *)((T *)intersect_pts_x + (20 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts2_x + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_x + (20 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts2_x + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); __bang_band( - (char *)((T *)intersect_pts_y + (20 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts2_y + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_y + (20 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts2_y + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); // assign valid_pts bit and accumulate nums of valid points of each box pair __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num, diff --git a/kernels/carafe/carafe.cpp b/kernels/carafe/carafe.cpp index c55b9b7f9..e4cc09b03 100644 --- a/kernels/carafe/carafe.cpp +++ b/kernels/carafe/carafe.cpp @@ -254,7 +254,7 @@ mluOpStatus_t genPolicy(mluOpHandle_t handle, << "NRAM usage (Nb. 
of dtype_size) = " << nram_usage; // determine task type and dims - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = core_dim; k_dim->y = union_number; k_dim->z = 1; @@ -783,6 +783,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpCarafeForward( &grid_dimG, &grid_dimC, &job_num), "Error occured in generating policy."); + { + LARGE_TENSOR_CHECK("[mluOpCarafeForward]", input_desc); + LARGE_TENSOR_CHECK("[mluOpCarafeForward]", mask_desc); + LARGE_TENSOR_CHECK("[mluOpCarafeForward]", output_desc); + } // GEN_CASE if (MLUOP_GEN_CASE_ON_NEW) { GEN_CASE_START("carafe_forward", "CARAFE_FORWARD"); @@ -790,12 +795,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpCarafeForward( GEN_CASE_DATA(true, "input", input, input_desc, 5.1, -5.3); GEN_CASE_DATA(true, "mask", mask, mask_desc, 0.0, 1.0); GEN_CASE_DATA(false, "output", output, output_desc, 1.7, -1.8); - GEN_CASE_OP_PARAM_SINGLE(0, "carafe_forward", "dimnb", carafe_desc->dimNb); - GEN_CASE_OP_PARAM_SINGLE(1, "carafe_forward", "kernel_size", + GEN_CASE_OP_PARAM_SINGLE(0, "carafe", "dimnb", carafe_desc->dimNb); + GEN_CASE_OP_PARAM_SINGLE(1, "carafe", "kernel_size", carafe_desc->kernel_size); - GEN_CASE_OP_PARAM_SINGLE(1, "carafe_forward", "group_size", + GEN_CASE_OP_PARAM_SINGLE(1, "carafe", "group_size", carafe_desc->group_size); - GEN_CASE_OP_PARAM_SINGLE(2, "carafe_forward", "scale_factor", + GEN_CASE_OP_PARAM_SINGLE(2, "carafe", "scale_factor", carafe_desc->scale_factor); GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); } @@ -840,6 +845,14 @@ mluOpStatus_t MLUOP_WIN_API mluOpCarafeBackward( return param_check_status; } + { + LARGE_TENSOR_CHECK("[mluOpCarafeBackward]", input_desc); + LARGE_TENSOR_CHECK("[mluOpCarafeBackward]", mask_desc); + LARGE_TENSOR_CHECK("[mluOpCarafeBackward]", grad_output_desc); + LARGE_TENSOR_CHECK("[mluOpCarafeBackward]", grad_input_desc); + LARGE_TENSOR_CHECK("[mluOpCarafeBackward]", grad_mask_desc); + } + if (MLUOP_GEN_CASE_ON_NEW) { GEN_CASE_START("carafe_backward", 
"CARAFE_BACKWARD"); GEN_CASE_HANDLE(handle); @@ -849,12 +862,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpCarafeBackward( -1.8); GEN_CASE_DATA(false, "grad_input", grad_input, grad_input_desc, 0, 0); GEN_CASE_DATA(false, "grad_mask", grad_mask, grad_mask_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "carafe_backward", "dimnb", carafe_desc->dimNb); - GEN_CASE_OP_PARAM_SINGLE(1, "carafe_backward", "kernel_size", + GEN_CASE_OP_PARAM_SINGLE(0, "carafe", "dimnb", carafe_desc->dimNb); + GEN_CASE_OP_PARAM_SINGLE(1, "carafe", "kernel_size", carafe_desc->kernel_size); - GEN_CASE_OP_PARAM_SINGLE(1, "carafe_backward", "group_size", + GEN_CASE_OP_PARAM_SINGLE(1, "carafe", "group_size", carafe_desc->group_size); - GEN_CASE_OP_PARAM_SINGLE(2, "carafe_backward", "scale_factor", + GEN_CASE_OP_PARAM_SINGLE(2, "carafe", "scale_factor", carafe_desc->scale_factor); GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); } @@ -883,7 +896,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCarafeBackward( task_dim_x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); task_dim_y = mluop::runtime::getClusterLimitCapability(handle); cnrtDim3_t k_dim = {task_dim_x, task_dim_y, 1}; - cnrtJobType_t k_type = CNRT_FUNC_TYPE_BLOCK; + cnrtFunctionType_t k_type = cnrtFuncTypeBlock; VLOG(5) << "Launch KernelCarafeBackward<<>>"; diff --git a/kernels/carafe/carafe_block.mlu b/kernels/carafe/carafe_block.mlu index b3c3c427b..c84572001 100644 --- a/kernels/carafe/carafe_block.mlu +++ b/kernels/carafe/carafe_block.mlu @@ -27,7 +27,7 @@ #include "kernels/debug.h" #include "kernels/utils/common.h" -__nram__ char nram_buf[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buf[MAX_NRAM_SIZE]; template __mlu_global__ void MLUKernelCarafeForward( diff --git a/kernels/deform_roi_pool/deform_roi_pool.cpp b/kernels/deform_roi_pool/deform_roi_pool.cpp index c8c841fcd..1c1201306 100644 --- a/kernels/deform_roi_pool/deform_roi_pool.cpp +++ b/kernels/deform_roi_pool/deform_roi_pool.cpp @@ -48,7 +48,7 @@ void policyFunc(const mluOpHandle_t 
handle, k_dim->y = (num_bins / core_limit) > cluster_limit ? cluster_limit : (num_bins / core_limit); k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } static mluOpStatus_t DeformRoiPoolForwardPreCheck( diff --git a/kernels/deform_roi_pool/deform_roi_pool_union1.mlu b/kernels/deform_roi_pool/deform_roi_pool_union1.mlu index 6f3ea506d..de67c2516 100644 --- a/kernels/deform_roi_pool/deform_roi_pool_union1.mlu +++ b/kernels/deform_roi_pool/deform_roi_pool_union1.mlu @@ -36,7 +36,7 @@ #define NINESPLIT 9 #define THIRTEENSPLIT 13 -__nram__ char data_nram[MAX_NRAM_SIZE]; +__nram__ int8_t data_nram[MAX_NRAM_SIZE]; template __mlu_func__ bool containNanInf(const T pos1) { diff --git a/kernels/device_check.h b/kernels/device_check.h new file mode 100644 index 000000000..d106ddafa --- /dev/null +++ b/kernels/device_check.h @@ -0,0 +1,35 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef KERNELS_DEVICE_CHECK_H_ +#define KERNELS_DEVICE_CHECK_H_ + +#ifdef NDEBUG +#ifdef __BANG__ +#include "bang.h" +__mlu_device__ __mlu_builtin__ __attribute__((noinline)) void __assert_fail( + const char *__message, const char *__file, unsigned int __line, + const char *__function); +#endif +#endif + +// The variable names on the device side are usually different from the input +// parameters in the API function, so the logging information needs to be +// written as needed. The detailed information of the "__assert_fail" +// instruction can be found in +// "llvm-project/-/blob/master/docs_bang/design_docs/assert/BANGAssert.md" +#define MLU_KERNEL_ASSERT(cond, message) \ + if (!(cond)) { \ + __assert_fail(message, __FILE__, static_cast(__LINE__), \ + __func__); \ + } + +#endif // KERNELS_DEVICE_CHECK_H_ diff --git a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp index 94016a90c..1f669d49a 100644 --- a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp +++ b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp @@ -34,7 +34,7 @@ static void policyFunc(const mluOpHandle_t handle, const int bn_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; uint32_t cluster_num = mluop::runtime::getClusterLimitCapability(handle); uint32_t core_num_per_cluster = mluop::runtime::getCoreNumOfEachUnionCapability(handle); diff --git a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu index 467c571c1..d9bb2bffa 100644 --- 
a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu +++ b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu @@ -27,13 +27,12 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; -#if __BANG_ARCH__ >= 372 template static __mlu_func__ void load(const T *addr_vertices, const bool *addr_mask, - const int *addr_num_valid, char *nram_vertices, - char *nram_mask, char *nram_num_valid, + const int *addr_num_valid, int8_t *nram_vertices, + int8_t *nram_mask, int8_t *nram_num_valid, const int dim_m, const int deal_num, const int pingpong_offset, const int pi) { int offset = (pi % 2) * pingpong_offset; @@ -112,8 +111,8 @@ static __mlu_func__ bool compare_vertices(T x1, T y1, T diff_1, T x2, T y2, } template -static __mlu_func__ void compute(char *nram_vertices, char *nram_mask, - char *nram_num_valid, char *nram_idx, +static __mlu_func__ void compute(int8_t *nram_vertices, int8_t *nram_mask, + int8_t *nram_num_valid, int8_t *nram_idx, T *nram_pub_space, const int dim_m, const int deal_num, const int pingpong_offset, const int pi) { @@ -204,7 +203,7 @@ static __mlu_func__ void compute(char *nram_vertices, char *nram_mask, } } -static __mlu_func__ void store(int *addr_idx, char *nram_idx, +static __mlu_func__ void store(int *addr_idx, int8_t *nram_idx, const int deal_idx_num, const int pingpong_offset, const int pi) { int offset = (pi % 2) * pingpong_offset; @@ -215,8 +214,8 @@ static __mlu_func__ void store(int *addr_idx, char *nram_idx, template static __mlu_func__ void lcs(const T *base_vertices, const bool *base_mask, const int *base_num_valid, int *base_idx, - char *nram_vertices, char *nram_mask, - char *nram_num_valid, char *nram_idx, + int8_t *nram_vertices, int8_t *nram_mask, + int8_t *nram_num_valid, int8_t *nram_idx, T *nram_pub_space, const int deal_num, const int repeat_n, 
const int rem_num, const int rem_offset, const int dim_m, @@ -322,13 +321,11 @@ static __mlu_func__ void lcs(const T *base_vertices, const bool *base_mask, repeat_n); } } -#endif template __mlu_global__ void MLUKernelDiffIouRotatedSortVerticesForward( const T *vertices, const bool *mask, const int *num_valid, int *idx, const int dim_b, const int dim_n, const int dim_m) { -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -364,10 +361,10 @@ __mlu_global__ void MLUKernelDiffIouRotatedSortVerticesForward( T *nram_pub_space = (T *)nram_buffer; // ping/pong - char *nram_vertices = (char *)(nram_pub_space + 3 * deal_num * dim_m); - char *nram_mask = nram_vertices + deal_vertices_num * sizeof(T); - char *nram_num_valid = nram_mask + deal_mask_num * sizeof(bool); - char *nram_idx = nram_num_valid + deal_num_valid_num * sizeof(int); + int8_t *nram_vertices = (int8_t *)(nram_pub_space + 3 * deal_num * dim_m); + int8_t *nram_mask = nram_vertices + deal_vertices_num * sizeof(T); + int8_t *nram_num_valid = nram_mask + deal_mask_num * sizeof(bool); + int8_t *nram_idx = nram_num_valid + deal_num_valid_num * sizeof(int); int repeat_n = total_bn_num / (deal_num * taskDim); int rem_num_device = total_bn_num % (deal_num * taskDim); @@ -384,7 +381,6 @@ __mlu_global__ void MLUKernelDiffIouRotatedSortVerticesForward( lcs(vertices, mask, num_valid, idx, nram_vertices, nram_mask, nram_num_valid, nram_idx, nram_pub_space, deal_num, repeat_n, rem_num, rem_offset, dim_m, pingpong_offset); -#endif } mluOpStatus_t MLUOP_WIN_API KernelDiffIouRotatedSortVerticesForward( diff --git a/kernels/div/div_union1.mlu b/kernels/div/div_union1.mlu index 6951934e7..34de15f1d 100644 --- a/kernels/div/div_union1.mlu +++ b/kernels/div/div_union1.mlu @@ -33,9 +33,9 @@ #define SCALE 1e-5 #define LOW_SCALE 1e10 -__nram__ char nram_buffer[BINARY_NRAM_SIZE]; +__nram__ int8_t nram_buffer[BINARY_NRAM_SIZE]; #if __BANG_ARCH__ != 520 -__mlu_shared__ char sram_buffer[BINARY_SRAM_SIZE]; +__mlu_shared__ int8_t 
sram_buffer[BINARY_SRAM_SIZE]; #endif template -__mlu_func__ void computeDivFloat(char *nram_z, char *nram_x, char *nram_y, - char *nram_temp1, char *nram_temp2, - char *nram_aux3, int32_t deal_num, - int32_t actual_num) { +__mlu_func__ void computeDivFloat(int8_t *nram_z, int8_t *nram_x, + int8_t *nram_y, int8_t *nram_temp1, + int8_t *nram_temp2, int8_t *nram_aux3, + int32_t deal_num, int32_t actual_num) { #if __BANG_ARCH__ > 500 __bang_div((float *)nram_x, (float *)nram_x, (DType_in2 *)nram_y, actual_num); #else @@ -140,9 +140,9 @@ __mlu_func__ void computeDivFloat(char *nram_z, char *nram_x, char *nram_y, */ template -__mlu_func__ void computeDivHalf(char *nram_z, char *nram_x, char *nram_y, - char *nram_temp1, char *nram_temp2, - char *nram_aux3, int32_t deal_num, +__mlu_func__ void computeDivHalf(int8_t *nram_z, int8_t *nram_x, int8_t *nram_y, + int8_t *nram_temp1, int8_t *nram_temp2, + int8_t *nram_aux3, int32_t deal_num, int32_t actual_num) { #if __BANG_ARCH__ > 500 __bang_div((half *)nram_z, (half *)nram_x, (half *)nram_y, actual_num); @@ -174,14 +174,14 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineDiv( /* Only float and half data types are supported in host-side CPP file fool-proof processing. */ case MLUOP_DTYPE_FLOAT: { - KERNEL_CHECK( - MLUBlockKernel3StagePipelineDivFloat - <<>>((char *)x, (char *)y, (char *)z, num)); + KERNEL_CHECK(MLUBlockKernel3StagePipelineDivFloat + <<>>((int8_t *)x, (int8_t *)y, + (int8_t *)z, num)); }; break; case MLUOP_DTYPE_HALF: { - KERNEL_CHECK( - MLUBlockKernel3StagePipelineDivHalf - <<>>((char *)x, (char *)y, (char *)z, num)); + KERNEL_CHECK(MLUBlockKernel3StagePipelineDivHalf + <<>>((int8_t *)x, (int8_t *)y, + (int8_t *)z, num)); }; break; default: break; @@ -197,14 +197,14 @@ mluOpStatus_t MLUOP_WIN_API Kernel5StagePipelineDiv( /* Only float and half data types are supported in host-side CPP file fool-proof processing. 
*/ case MLUOP_DTYPE_FLOAT: { - KERNEL_CHECK( - MLUBlockKernel5StagePipelineDivFloat - <<>>((char *)x, (char *)y, (char *)z, num)); + KERNEL_CHECK(MLUBlockKernel5StagePipelineDivFloat + <<>>((int8_t *)x, (int8_t *)y, + (int8_t *)z, num)); }; break; case MLUOP_DTYPE_HALF: { - KERNEL_CHECK( - MLUBlockKernel5StagePipelineDivHalf - <<>>((char *)x, (char *)y, (char *)z, num)); + KERNEL_CHECK(MLUBlockKernel5StagePipelineDivHalf + <<>>((int8_t *)x, (int8_t *)y, + (int8_t *)z, num)); }; break; default: break; diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp index c5fffd4d4..da589c8ba 100644 --- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp +++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp @@ -192,19 +192,19 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, } else { if (N <= 4) { k_dim->x = core_num * 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } else if (N <= 8) { k_dim->x = core_num * 2; - *k_type = CNRT_FUNC_TYPE_UNION2; + *k_type = cnrtFuncTypeUnion2; } else if (N <= 16) { k_dim->x = core_num * 4; - *k_type = CNRT_FUNC_TYPE_UNION4; + *k_type = cnrtFuncTypeUnion4; } else if (N <= 32) { k_dim->x = core_num * 8; - *k_type = CNRT_FUNC_TYPE_UNION8; + *k_type = cnrtFuncTypeUnion8; } else if (N <= 64) { k_dim->x = core_num * 16; - *k_type = CNRT_FUNC_TYPE_UNION16; + *k_type = cnrtFuncTypeUnion16; } else { LOG(ERROR) << "[mluOpDynamicPointToVoxelBackward]: failed to choose kernel " @@ -368,6 +368,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelBackwardWorkspaceSize( << "Please check the device version!"; return MLUOP_STATUS_ARCH_MISMATCH; } + PARAM_CHECK(interface_name, grad_voxel_feats_desc != NULL); PARAM_CHECK(interface_name, feats_desc != NULL); 
PARAM_CHECK(interface_name, voxel_feats_desc != NULL); diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu index 9e7d137b9..c114aca07 100644 --- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu +++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu @@ -27,7 +27,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void loadAsync(T *feats_nram, T *voxel_feats_nram, @@ -168,7 +168,7 @@ __mlu_global__ void MLUKernelMaxReduceTracebackScatterIdx( // broadcast point2voxel_map to nram __memcpy(point2voxel_map_nram, point2voxel_map, size_input, GDRAM2NRAM); // initialze voxel_from_flag to false - __memset_nram(voxel_from_flag_nram, M, (char)false); + __memset_nram(voxel_from_flag_nram, M, (int8_t) false); for (int i = 0; i < C; i++) { index_col_nram[i] = i; } diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp index 0d856cc68..4c8fcc9ef 100644 --- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp +++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp @@ -45,22 +45,22 @@ static void policyFuncDynamicPointToVoxelForward(const mluOpHandle_t handle, } else { if (nums == 1) { k_dim->x = 1; - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; } else if (nums <= 4) { k_dim->x = core_num * 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } else if (nums <= 8) { k_dim->x = core_num * 2; - 
*k_type = CNRT_FUNC_TYPE_UNION2; + *k_type = cnrtFuncTypeUnion2; } else if (nums <= 16) { k_dim->x = core_num * 4; - *k_type = CNRT_FUNC_TYPE_UNION4; + *k_type = cnrtFuncTypeUnion4; } else if (nums <= 32) { k_dim->x = core_num * 8; - *k_type = CNRT_FUNC_TYPE_UNION8; + *k_type = cnrtFuncTypeUnion8; } else if (nums <= 64) { k_dim->x = core_num * 16; - *k_type = CNRT_FUNC_TYPE_UNION16; + *k_type = cnrtFuncTypeUnion16; } } k_dim->y = 1; diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu index b0c7d8711..98a49ca73 100644 --- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu +++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu @@ -27,7 +27,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define COORS_IDX 1 #define COORS_XYZ 3 @@ -45,7 +45,7 @@ __mlu_func__ void compute(float *nram_input, int *nram_points_count, int offset = (pi % 2) * 2 * deal_num; float *nram_input_p = nram_input + offset; float *nram_output_p = nram_input + offset + deal_num; -#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372) +#if __BANG_ARCH__ != 372 __bang_div(nram_output_p, nram_input_p, (float)(nram_points_count[pi]), deal_num); #else @@ -128,7 +128,6 @@ __mlu_global__ void MLUKernelDynamicPointToVoxelForward( mluOpReduceMode_t reduce_mode, const float *feats, int32_t num_points, int32_t num_feats, int32_t *voxel_coors, int32_t *voxel_num, int *point2voxel_map, int32_t *voxel_points_count, float *voxel_feats) { -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -322,7 +321,6 @@ __mlu_global__ void MLUKernelDynamicPointToVoxelForward( } } } -#endif } mluOpStatus_t MLUOP_WIN_API 
KernelDynamicPointToVoxelForward( diff --git a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu index 63f350b47..67f67bc05 100644 --- a/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu +++ b/kernels/dynamic_point_to_voxel/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu @@ -29,7 +29,7 @@ #define COORS_XYZ 3 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_func__ void load(const int32_t *input_addr, int32_t *nram_input, const int32_t pingpong, const int32_t deal_num, @@ -81,7 +81,6 @@ __mlu_func__ void store(int32_t *output_addr, int32_t *nram_output, __mlu_global__ void MLUKernelMaskFillCoorsForward(int32_t num_points, int32_t *coors) { -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -162,8 +161,6 @@ __mlu_global__ void MLUKernelMaskFillCoorsForward(int32_t num_points, int32_t *output_addr = base_coors + repeat_n * deal_num; store(output_addr, coors_ping_in, pingpong, rem_num, repeat_n); } - -#endif } mluOpStatus_t MLUOP_WIN_API diff --git a/kernels/fft/c2c_fft/c2c_fft_host.cpp b/kernels/fft/c2c_fft/c2c_fft_host.cpp index d855c554f..29c53d61f 100644 --- a/kernels/fft/c2c_fft/c2c_fft_host.cpp +++ b/kernels/fft/c2c_fft/c2c_fft_host.cpp @@ -527,7 +527,7 @@ mluOpStatus_t setFFT1dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, mluop::runtime::getClusterLimitCapability(handle); const unsigned int core_dim = handle->core_num_per_cluster; cnrtDim3_t k_dim = {core_dim, cluster_number, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_BLOCK; + cnrtFunctionType_t k_type = cnrtFuncTypeBlock; switch (fft_plan->fft_strategy) { case CNFFT_FUNC_MATMUL: { @@ -1242,13 +1242,13 @@ static mluOpStatus_t padFFT1dContiguousInput(mluOpHandle_t handle, 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(padded_input_desc, cnnl_padded_input_desc); - CALL_CNNL(cnnlPad( - cnnl_handle, cnnl_input_desc, - fft_plan->prime ? fft_plan->matmul_addrs.input_contiguous_addr - : fft_plan->mlu_addrs.input, - paddings, &padding_value, cnnl_padded_input_desc, - fft_plan->prime ? fft_plan->matmul_addrs.input_pad_addr - : fft_plan->mlu_addrs.input_pad_addr)); + CALL_CNNL(cnnlPad(cnnl_handle, cnnl_input_desc, + fft_plan->prime + ? fft_plan->matmul_addrs.input_contiguous_addr + : fft_plan->mlu_addrs.input, + paddings, &padding_value, cnnl_padded_input_desc, + fft_plan->prime ? fft_plan->matmul_addrs.input_pad_addr + : fft_plan->mlu_addrs.input_pad_addr)); // destroy cnnl descriptor VLOG(5) << "c2cfft cnnlOpPad end"; @@ -1644,7 +1644,7 @@ static mluOpStatus_t computeFFT1dMatmulResult(mluOpHandle_t handle, static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = handle->core_num_per_cluster; k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; @@ -1664,7 +1664,7 @@ mluOpStatus_t mergeFFT1dOutput(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, VLOG(5) << "launch merge fft1d output"; // TODO(niyuming) luanch merge kernel int core_num = handle->core_num_per_cluster; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; int task_type = mluop::runtime::getJobLimitCapability(handle); int task_num = 1; @@ -1672,16 +1672,16 @@ mluOpStatus_t mergeFFT1dOutput(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, default: task_num = core_num; break; - case (int)CNRT_FUNC_TYPE_UNION2: + case (int)cnrtFuncTypeUnion2: task_num = core_num * 2; break; - case (int)CNRT_FUNC_TYPE_UNION4: + case (int)cnrtFuncTypeUnion4: task_num = core_num * 4; break; - case (int)CNRT_FUNC_TYPE_UNION8: + case 
(int)cnrtFuncTypeUnion8: task_num = core_num * 8; break; - case (int)CNRT_FUNC_TYPE_UNION16: + case (int)cnrtFuncTypeUnion16: task_num = core_num * 16; break; } diff --git a/kernels/fft/common/fft_basic_ops.cpp b/kernels/fft/common/fft_basic_ops.cpp index 24cb115ac..fd56557d3 100644 --- a/kernels/fft/common/fft_basic_ops.cpp +++ b/kernels/fft/common/fft_basic_ops.cpp @@ -378,9 +378,6 @@ mluOpStatus_t fftQuantMatMul(mluOpHandle_t handle, int m, int k, int n, &trans_b_int, sizeof(int32_t))); CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, &allow_tf32, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, - &cnnl_compute_type, - sizeof(cnnl_compute_type))); CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); int32_t requested_algo_count = 1, return_algo_count = 0; @@ -399,6 +396,7 @@ mluOpStatus_t fftQuantMatMul(mluOpHandle_t handle, int m, int k, int n, cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr, &beta, cnnl_c_desc, c_ptr, workspace, workspace_size, cnnl_d_desc, c_ptr)); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); } // destroy cnnl descriptor diff --git a/kernels/fft/common/fft_common_kernels.mlu b/kernels/fft/common/fft_common_kernels.mlu index 164ea745c..8cca3a697 100644 --- a/kernels/fft/common/fft_common_kernels.mlu +++ b/kernels/fft/common/fft_common_kernels.mlu @@ -30,7 +30,7 @@ #define PAD_SIZE 64 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; /* convert function @@ -78,36 +78,26 @@ __mlu_func__ void __mluop_mod(T *src_addr, T *temp_addr, T n, int len) { */ __mlu_func__ void genSinCosVec(float *src_addr, float *sin_addr, float *cos_addr, int deal_size) { -#if __BANG_ARCH__ >= 372 __bang_sin(sin_addr, src_addr, deal_size); __bang_cos(cos_addr, src_addr, deal_size); -#else - for (int i = 0; i < deal_size; i++) { - sin_addr[i] = sinf(src_addr[i]); - cos_addr[i] = cosf(src_addr[i]); - } -#endif } /* 
generate select offset vector function: bang_arch >= 372, for gather inst offset, should mul sizeof(float) - bang_arch < 372, for scalar select offset [input] offset_addr: offset data in float32 [input] deal_size: offset data number(don't need align) [output] offset_int_addr: offset data in int32 */ __mlu_func__ void genSelectOffsetVec(float *offset_addr, int32_t *offset_int_addr, int deal_size) { - for (int i = 0; i < deal_size; i++) { - offset_int_addr[i] = (int)(offset_addr[i]); - } + __bang_mul_scalar(offset_addr, offset_addr, (float)sizeof(float), deal_size); + __bang_float2int32((int32_t *)offset_int_addr, offset_addr, deal_size, 0); } /* select data function: bang_arch >= 372, use gather inst, offset should mul sizeof(float) - bang_arch < 372, use for and scalar select inst [input] src_addr: input data to be selected [input] offset_int_addr: offset data to select data in int32 [input] deal_size: offset data number(don't need align) @@ -115,9 +105,16 @@ __mlu_func__ void genSelectOffsetVec(float *offset_addr, */ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr, float *dst_addr, int deal_size) { +#if __BANG_ARCH__ >= 372 && __BANG_ARCH__ != 520 + __asm__ volatile( + "gather.clean.nram.nram.nram.b32.u32 " + "[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr), + [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size)); +#else for (auto i = 0; i < deal_size; i++) { dst_addr[i] = src_addr[offset_int_addr[i]]; } +#endif } /* @@ -152,7 +149,7 @@ __mlu_func__ void generateRFFTHalfDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float scale = -2.0 * M_PI / n; @@ -236,7 +233,7 @@ __mlu_func__ void generateRFFTFullDFTMatrixImpl(int row, int n, void *output) { float *row_addr = temp_addr; // generate 0 to n 
indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float scale = -2.0 * M_PI / n; @@ -325,7 +322,7 @@ __mlu_func__ void generateIRFFTHalfDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos coefficient vectors __bang_write_value((float *)cos_coeff_addr, deal_size, (float)2.0); @@ -420,7 +417,7 @@ __mlu_func__ void generateIRFFTFullDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float scale = 2.0 * M_PI / n; @@ -516,7 +513,7 @@ __mlu_func__ void generateC2CFFTDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float forward_scale = -2.0 * M_PI / n; diff --git a/kernels/fft/fft_optm_device/fft_c2c_stockham_gdram.h b/kernels/fft/fft_optm_device/fft_c2c_stockham_gdram.h index 070648d32..e0374cdf9 100644 --- a/kernels/fft/fft_optm_device/fft_c2c_stockham_gdram.h +++ b/kernels/fft/fft_optm_device/fft_c2c_stockham_gdram.h @@ -24,8 +24,8 @@ #include "kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h" #include "kernels/fft/fft_optm_device/fft_sram_allocate.h" -extern __nram__ char nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; -extern __wram__ char wram_buffer[MAX_WRAM_SIZE]; +extern __nram__ int8_t nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; +extern __wram__ int8_t wram_buffer[MAX_WRAM_SIZE]; // Perform C2C in-place network allocation for the row slice on the chip template 
@@ -38,7 +38,7 @@ __mlu_func__ void computeMutiStageOnchip(DT *input, DT *output, int *factors, int repeat_num = total_num / taskDim; int remain_num = total_num % taskDim; - char *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); + int8_t *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); int *nram_factors = (int *)nram_buffer; int t_len = repeat_num + ((remain_num > 0 && taskId < remain_num) ? 1 : 0); @@ -217,7 +217,7 @@ __mlu_func__ void computeMutiStageOnchipColumn( int repeat_num = total_num / taskDim; int remain_num = total_num % taskDim; - char *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); + int8_t *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); int *nram_factors = (int *)nram_buffer; int t_len = repeat_num + ((remain_num > 0 && taskId < remain_num) ? 1 : 0); diff --git a/kernels/fft/fft_optm_device/fft_c2r_stockham_gdram.h b/kernels/fft/fft_optm_device/fft_c2r_stockham_gdram.h index 8fa35fe9f..eb18b718e 100644 --- a/kernels/fft/fft_optm_device/fft_c2r_stockham_gdram.h +++ b/kernels/fft/fft_optm_device/fft_c2r_stockham_gdram.h @@ -24,8 +24,8 @@ #include "kernels/fft/fft_optm_device/fft_c2r_stockham_nram.h" #include "kernels/fft/fft_optm_device/fft_sram_allocate.h" -extern __nram__ char nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; -extern __wram__ char wram_buffer[MAX_WRAM_SIZE]; +extern __nram__ int8_t nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; +extern __wram__ int8_t wram_buffer[MAX_WRAM_SIZE]; // Compute multi-stage on-chip FFT from complex to real (C2R) template @@ -36,7 +36,7 @@ __mlu_func__ void computeMutiStageOnchipC2R( int repeat_num = total_num / taskDim; int remain_num = total_num % taskDim; - char *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); + int8_t *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); int *nram_factors = (int *)nram_buffer; // Each core needs to process "t_len" blocks, "remain_num" is evenly diff --git a/kernels/fft/fft_optm_device/fft_cooley-tukey_ux_device.mlu 
b/kernels/fft/fft_optm_device/fft_cooley-tukey_ux_device.mlu index b5feb351a..b7967af4b 100644 --- a/kernels/fft/fft_optm_device/fft_cooley-tukey_ux_device.mlu +++ b/kernels/fft/fft_optm_device/fft_cooley-tukey_ux_device.mlu @@ -63,15 +63,8 @@ __mlu_func__ void genWVec1(float *w_r, float *w_i, float *w_tmp1, float *w_tmp2, 2.0 * M_PI / (n / L_align * L * 2.0) * (fft_flag != IRFFT ? -1 : 1); scale *= ((fft_flag == FFT_IFFT && direction == FFT_INVERSE) ? -1 : 1); __bang_mul_scalar(tmp_offset_addr, tmp_inc_addr, scale, L_align); -#if __BANG_ARCH__ >= 372 __bang_cos(tmp_cos_addr, tmp_offset_addr, L_align); __bang_sin(tmp_sin_addr, tmp_offset_addr, L_align); -#else - for (int i = 0; i < L_align; i++) { - tmp_cos_addr[i] = cosf(tmp_offset_addr[i]); - tmp_sin_addr[i] = sinf(tmp_offset_addr[i]); - } -#endif } } @@ -270,7 +263,6 @@ __mlu_func__ void transAndStore(DT *x_out_trans, DT *y_in_r, DT *y_in_i, int bc_offset = bc * n * COMPLEX_FACTOR; int dst_offset = (ro * repeat_inner_basic_group + ri) * basic_size * 2 + bc_offset; -#if __BANG_ARCH__ >= 372 __bang_transpose(x_out_trans, y_in_r, COMPLEX_FACTOR, basic_size_align_via_L); __memcpy((DT *)output + dst_offset, x_out_trans, @@ -281,28 +273,6 @@ __mlu_func__ void transAndStore(DT *x_out_trans, DT *y_in_r, DT *y_in_i, __memcpy((DT *)output + dst_offset + n, x_out_trans, basic_size_bytes * COMPLEX_FACTOR, NRAM2GDRAM); } -#else - // For efficiency and space, we split one transpose [2, N] --> [2, x, N/x], - // into two as follows: - // trans1: [(2, x), N/x] -> [N/x, (2, x)] - // trans2: [(N/x, 2), x)] -> [x, (N/x, 2)] - // Note: both dim2 and dim3 need to meet alignment requirement: - // TRANS_ALIGN_SIZE / (int)sizeof(DT) - int dim1 = COMPLEX_FACTOR; - int dim2 = TRANS_ALIGN_SIZE / (int)sizeof(DT); - int dim3 = basic_size_align_via_L / dim2; - DT *x_out_trans_tmp = x_out_trans + dim1 * dim2 * dim3; - __bang_transpose(x_out_trans_tmp, y_in_r, dim1 * dim2, dim3); - __bang_transpose(x_out_trans, x_out_trans_tmp, dim3 * dim1, 
dim2); - __memcpy((DT *)output + dst_offset, x_out_trans, - basic_size_bytes * COMPLEX_FACTOR, NRAM2GDRAM); - if (stage == ITER_ONCHIP) { - __bang_transpose(x_out_trans_tmp, y_in_i, dim1 * dim2, dim3); - __bang_transpose(x_out_trans, x_out_trans_tmp, dim3 * dim1, dim2); - __memcpy((DT *)output + dst_offset + n, x_out_trans, - basic_size_bytes * COMPLEX_FACTOR, NRAM2GDRAM); - } -#endif } else if (fft_flag == RFFT) { int bc_offset = bc * (n / 2 + 1) * COMPLEX_FACTOR; int dst_offset = (ro * repeat_inner_basic_group + ri) * basic_size; @@ -312,18 +282,8 @@ __mlu_func__ void transAndStore(DT *x_out_trans, DT *y_in_r, DT *y_in_i, *((DT *)output + bc_offset + (n + 1)) = *((DT *)y_in_i + basic_size / YZ_FACTOR); } -#if __BANG_ARCH__ >= 372 __bang_transpose(x_out_trans, y_in_r, COMPLEX_FACTOR, basic_size_align_via_L); -#else - // the principle of transpose is the same as FFT_IFFT - int dim1 = COMPLEX_FACTOR; - int dim2 = TRANS_ALIGN_SIZE / (int)sizeof(DT); - int dim3 = basic_size_align_via_L / dim2; - DT *x_out_trans_tmp = x_out_trans + dim1 * dim2 * dim3; - __bang_transpose(x_out_trans_tmp, y_in_r, dim1 * dim2, dim3); - __bang_transpose(x_out_trans, x_out_trans_tmp, dim3 * dim1, dim2); -#endif if (stage == ITER_OFFCHIP) { __memcpy((DT *)output + dst_offset + bc_offset, x_out_trans, basic_size_bytes, NRAM2GDRAM); diff --git a/kernels/fft/fft_optm_device/fft_generic_butterfly.h b/kernels/fft/fft_optm_device/fft_generic_butterfly.h index 9a9eacc10..f0a87216f 100644 --- a/kernels/fft/fft_optm_device/fft_generic_butterfly.h +++ b/kernels/fft/fft_optm_device/fft_generic_butterfly.h @@ -23,7 +23,7 @@ #pragma once #include "kernels/fft/fft_optm_device/fft_butterfly_ops.h" -extern __wram__ char wram_buffer[MAX_WRAM_SIZE]; +extern __wram__ int8_t wram_buffer[MAX_WRAM_SIZE]; template __mlu_func__ void computeGenericButterflyFirststageMat_v1( diff --git a/kernels/fft/fft_optm_device/fft_nram_wram_allocate.h b/kernels/fft/fft_optm_device/fft_nram_wram_allocate.h index 
b29f358a9..f8d7978ae 100644 --- a/kernels/fft/fft_optm_device/fft_nram_wram_allocate.h +++ b/kernels/fft/fft_optm_device/fft_nram_wram_allocate.h @@ -23,5 +23,5 @@ #pragma once -__nram__ char nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; -__wram__ char wram_buffer[MAX_WRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; +__wram__ int8_t wram_buffer[MAX_WRAM_SIZE]; diff --git a/kernels/fft/fft_optm_device/fft_r2c_stockham_gdram.h b/kernels/fft/fft_optm_device/fft_r2c_stockham_gdram.h index aa0201a6c..4d9c27440 100644 --- a/kernels/fft/fft_optm_device/fft_r2c_stockham_gdram.h +++ b/kernels/fft/fft_optm_device/fft_r2c_stockham_gdram.h @@ -24,8 +24,8 @@ #include "kernels/fft/fft_optm_device/fft_r2c_stockham_nram.h" #include "kernels/fft/fft_optm_device/fft_sram_allocate.h" -extern __nram__ char nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; -extern __wram__ char wram_buffer[MAX_WRAM_SIZE]; +extern __nram__ int8_t nram_buffer[MAX_NRAM_SIZE + REM_FOR_STACK - 32 * 1024]; +extern __wram__ int8_t wram_buffer[MAX_WRAM_SIZE]; // Compute multi-stage FFT from real to complex (R2C) on-chip template @@ -39,7 +39,7 @@ __mlu_func__ void computeMutiStageR2COnchip(DT *input, DT *output, int *factors, int repeat_num = total_num / taskDim; int remain_num = total_num % taskDim; - char *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); + int8_t *nram_buf = nram_buffer + FFT_MAXFACTORS * sizeof(int); int *nram_factors = (int *)nram_buffer; int t_len = repeat_num + ((remain_num > 0 && taskId < remain_num) ? 
1 : 0); diff --git a/kernels/fft/fft_optm_device/fft_sram_allocate.h b/kernels/fft/fft_optm_device/fft_sram_allocate.h index c4690ab82..a59955826 100644 --- a/kernels/fft/fft_optm_device/fft_sram_allocate.h +++ b/kernels/fft/fft_optm_device/fft_sram_allocate.h @@ -23,4 +23,4 @@ #pragma once -__mlu_shared__ char sram_buffer[MAX_SRAM_SIZE]; +__mlu_shared__ int8_t sram_buffer[MAX_SRAM_SIZE]; diff --git a/kernels/fft/fft_optm_device/fft_stockham_u1_device.mlu b/kernels/fft/fft_optm_device/fft_stockham_u1_device.mlu index 3651d1807..99ef014cd 100644 --- a/kernels/fft/fft_optm_device/fft_stockham_u1_device.mlu +++ b/kernels/fft/fft_optm_device/fft_stockham_u1_device.mlu @@ -45,7 +45,6 @@ __mlu_func__ void genWSc1_opt(DT* w_r, DT* w_i, DT* tmp, DT* seq_addr, __bang_add_scalar(tmp, seq_addr, inc_value, size_tmp_bytes); __bang_mul_scalar(tmp, tmp, scale, size_tmp_bytes); -#if __BANG_ARCH__ >= 372 __bang_cos((float*)w_r, (float*)tmp, size_tmp_bytes); if (n <= 48000) { __bang_sin((float*)w_i, (float*)tmp, size_tmp_bytes); @@ -53,7 +52,6 @@ __mlu_func__ void genWSc1_opt(DT* w_r, DT* w_i, DT* tmp, DT* seq_addr, // This function has higher precision, and the actual test determined n. __cn_vector_sin_f32(size_tmp_bytes, (float*)w_i, (float*)tmp); } -#endif } // Load input data from GDRAM to NRAM. 
The data source(src_in) is the @@ -126,7 +124,7 @@ __mlu_func__ void load(DT* y_in_r, DT* y_in_i, DT* z_in_r, DT* z_in_i, int src_offset = L_sub * pow_2_m * part + b * n * 2; int data_size_bytes = pow_2_m * sizeof(DT); int total_data_size_bytes = L_deal * data_size_bytes; - int distance_bytes = int((char*)x_out1_i - (char*)x_out1_r); + int distance_bytes = int((int8_t*)x_out1_i - (int8_t*)x_out1_r); if (part < part_num / 2 || part_num == 1) { __memcpy_async(x_out1_r, matmul_re_mul_re_addr + src_offset, total_data_size_bytes, GDRAM2NRAM, distance_bytes, diff --git a/kernels/fft/irfft/irfft_host.cpp b/kernels/fft/irfft/irfft_host.cpp index df5b8fa1b..b065028e1 100644 --- a/kernels/fft/irfft/irfft_host.cpp +++ b/kernels/fft/irfft/irfft_host.cpp @@ -515,7 +515,7 @@ mluOpStatus_t setIRFFT1dReserveArea(mluOpHandle_t handle, mluop::runtime::getClusterLimitCapability(handle); const unsigned int core_dim = handle->core_num_per_cluster; cnrtDim3_t k_dim = {core_dim, cluster_number, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_BLOCK; + cnrtFunctionType_t k_type = cnrtFuncTypeBlock; switch (fft_plan->fft_strategy) { case CNFFT_FUNC_MATMUL: { @@ -1385,7 +1385,7 @@ static mluOpStatus_t computeIRFFT1dMatmulResult(mluOpHandle_t handle, static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = handle->core_num_per_cluster; k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; @@ -1404,7 +1404,7 @@ mluOpStatus_t mergeIRFFT1dOutput(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, VLOG(5) << "launch merge irfft1d output"; if (fft_plan->fft_strategy == CNFFT_FUNC_COOLEY_TUKEY) { int core_num = handle->core_num_per_cluster; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; int task_type = mluop::runtime::getJobLimitCapability(handle); int task_num = 1; @@ -1412,16 +1412,16 @@ mluOpStatus_t 
mergeIRFFT1dOutput(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, default: task_num = core_num; break; - case (int)CNRT_FUNC_TYPE_UNION2: + case (int)cnrtFuncTypeUnion2: task_num = core_num * 2; break; - case (int)CNRT_FUNC_TYPE_UNION4: + case (int)cnrtFuncTypeUnion4: task_num = core_num * 4; break; - case (int)CNRT_FUNC_TYPE_UNION8: + case (int)cnrtFuncTypeUnion8: task_num = core_num * 8; break; - case (int)CNRT_FUNC_TYPE_UNION16: + case (int)cnrtFuncTypeUnion16: task_num = core_num * 16; break; } @@ -1551,7 +1551,7 @@ mluOpStatus_t execIRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; } - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; cnrtDim3_t k_dim; k_dim.x = handle->core_num_per_cluster; k_dim.y = mluop::runtime::getClusterLimitCapability(handle); diff --git a/kernels/fft/rfft/rfft_host.cpp b/kernels/fft/rfft/rfft_host.cpp index 1bcbbfcb8..d0755e8be 100644 --- a/kernels/fft/rfft/rfft_host.cpp +++ b/kernels/fft/rfft/rfft_host.cpp @@ -490,7 +490,7 @@ mluOpStatus_t setRFFT1dReserveArea(mluOpHandle_t handle, mluop::runtime::getClusterLimitCapability(handle); const unsigned int core_dim = handle->core_num_per_cluster; cnrtDim3_t k_dim = {core_dim, cluster_number, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_BLOCK; + cnrtFunctionType_t k_type = cnrtFuncTypeBlock; switch (fft_plan->fft_strategy) { case CNFFT_FUNC_MATMUL: { @@ -1007,7 +1007,7 @@ static mluOpStatus_t computeRFFT1dMatmulResult(mluOpHandle_t handle, static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = handle->core_num_per_cluster; k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; @@ -1026,7 +1026,7 @@ static mluOpStatus_t mergeRFFT1dOutput(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_COOLEY_TUKEY) { 
VLOG(5) << "launch merge rfft1d output"; int core_num = handle->core_num_per_cluster; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; int task_type = mluop::runtime::getJobLimitCapability(handle); int task_num = 1; @@ -1034,16 +1034,16 @@ static mluOpStatus_t mergeRFFT1dOutput(mluOpHandle_t handle, default: task_num = core_num; break; - case (int)CNRT_FUNC_TYPE_UNION2: + case (int)cnrtFuncTypeUnion2: task_num = core_num * 2; break; - case (int)CNRT_FUNC_TYPE_UNION4: + case (int)cnrtFuncTypeUnion4: task_num = core_num * 4; break; - case (int)CNRT_FUNC_TYPE_UNION8: + case (int)cnrtFuncTypeUnion8: task_num = core_num * 8; break; - case (int)CNRT_FUNC_TYPE_UNION16: + case (int)cnrtFuncTypeUnion16: task_num = core_num * 16; break; } @@ -1272,7 +1272,7 @@ mluOpStatus_t execRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; } - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; cnrtDim3_t k_dim; k_dim.x = handle->core_num_per_cluster; k_dim.y = mluop::runtime::getClusterLimitCapability(handle); diff --git a/kernels/focal_loss_sigmoid/focal_loss_sigmoid.cpp b/kernels/focal_loss_sigmoid/focal_loss_sigmoid.cpp index cdbf56dde..cb014c8fb 100644 --- a/kernels/focal_loss_sigmoid/focal_loss_sigmoid.cpp +++ b/kernels/focal_loss_sigmoid/focal_loss_sigmoid.cpp @@ -35,7 +35,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = handle->core_num_per_cluster; k_dim->y = mluop::runtime::getClusterLimitCapability(handle); k_dim->z = 1; diff --git a/kernels/focal_loss_sigmoid/focal_loss_sigmoid_backward_union1.mlu b/kernels/focal_loss_sigmoid/focal_loss_sigmoid_backward_union1.mlu index d84742797..5ef3f1218 100644 --- a/kernels/focal_loss_sigmoid/focal_loss_sigmoid_backward_union1.mlu 
+++ b/kernels/focal_loss_sigmoid/focal_loss_sigmoid_backward_union1.mlu @@ -31,7 +31,7 @@ #define PING 0 #define PONG 1 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; namespace backward { @@ -54,19 +54,15 @@ __mlu_func__ void sigmoid(T *dst_data, const T *src_data, __mlu_func__ void computeLogE(float *nram_dst, float *nram_src, const int32_t deal_num) { -#if __BANG_ARCH__ >= 372 int x2d = 0x3f317217; float rlog2e = *(float *)&x2d; - __bang_log((float *)nram_dst, (float *)nram_src, deal_num); + __bang_log2((float *)nram_dst, (float *)nram_src, deal_num); __bang_mul_scalar((float *)nram_dst, (float *)nram_src, (float)rlog2e, deal_num); -#else - __bang_active_loghp((float *)nram_dst, (float *)nram_src, deal_num); -#endif } template -__mlu_func__ void loadInputBwd(char *nram_input, char *nram_target, +__mlu_func__ void loadInputBwd(int8_t *nram_input, int8_t *nram_target, const T *gdram_input, const int32_t *gdram_target, const int32_t deal_n, const int32_t total_c, @@ -81,7 +77,7 @@ __mlu_func__ void loadInputBwd(char *nram_input, char *nram_target, __memcpy_async(nram_target, gdram_target + gdram_offset / total_c, deal_n * sizeof(int32_t), GDRAM2NRAM); - char *nram_input_load = nram_input; + int8_t *nram_input_load = nram_input; int32_t compute_align_size = 2 * NFU_ALIGN_SIZE; if (has_weight) { if (sizeof(T) == sizeof(half)) { @@ -108,7 +104,7 @@ __mlu_func__ void loadInputBwd(char *nram_input, char *nram_target, } template -__mlu_func__ void storeOutputBwd(T *gdram_output, const char *nram_output, +__mlu_func__ void storeOutputBwd(T *gdram_output, const int8_t *nram_output, const int32_t deal_n, const int32_t total_c, const bool pingpong_flag, const bool has_weight, @@ -131,11 +127,11 @@ __mlu_func__ void storeOutputBwd(T *gdram_output, const char *nram_output, } template -__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, - const float *nram_flt_min, char *nram_pt, - char *nram_alpha_t, char *nram_temp, - 
char *nram_target, const float *nram_gamma, - char *nram_output, const float alpha, +__mlu_func__ void coreCompute(int8_t *nram_input, const T *nram_weight, + const float *nram_flt_min, int8_t *nram_pt, + int8_t *nram_alpha_t, int8_t *nram_temp, + int8_t *nram_target, const float *nram_gamma, + int8_t *nram_output, const float alpha, const int32_t compute_num, const int32_t deal_n, const int32_t total_c, const bool pingpong_flag, const int32_t nram_offset, @@ -151,7 +147,7 @@ __mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, if (sizeof(T) == sizeof(half)) { const int32_t compute_size = compute_num * sizeof(float); - char *nram_input_load = nram_input + compute_size / 2; + int8_t *nram_input_load = nram_input + compute_size / 2; __bang_half2float((float *)nram_input, (half *)nram_input_load, compute_num); } @@ -160,11 +156,7 @@ __mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, __bang_write_value((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0)); // 1. 
pt = 1 - sigmoid(x) -#if __BANG_ARCH__ >= 372 __mluop_sigmoid((float *)nram_pt, (float *)nram_input, NULL, 0, compute_num); -#else - sigmoid((float *)nram_pt, (float *)nram_input, compute_num); -#endif __bang_mul_scalar((float *)nram_pt, (float *)nram_pt, (float)(-1), compute_num); __bang_add_scalar((float *)nram_pt, (float *)nram_pt, (float)1, compute_num); @@ -302,12 +294,12 @@ __mlu_func__ void focalLossSigmoidBackwardBlock( T *base_addr_output = output + taskId * num_per_core; // nram addr - char *nram_input = (char *)nram_buffer; - char *nram_pt = nram_input + compute_size; - char *nram_alpha_t = nram_pt + compute_size; - char *nram_temp = nram_alpha_t + compute_size; - char *nram_output = nram_temp + compute_size; - char *nram_target = nram_output + compute_size; + int8_t *nram_input = (int8_t *)nram_buffer; + int8_t *nram_pt = nram_input + compute_size; + int8_t *nram_alpha_t = nram_pt + compute_size; + int8_t *nram_temp = nram_alpha_t + compute_size; + int8_t *nram_output = nram_temp + compute_size; + int8_t *nram_target = nram_output + compute_size; float *nram_flt_min = NULL; float *nram_gamma = NULL; T *nram_weight = NULL; diff --git a/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu b/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu index 92bf121e1..1418a9b2a 100644 --- a/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu +++ b/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu @@ -31,7 +31,7 @@ #define PING 0 #define PONG 1 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; namespace forward { @@ -44,27 +44,27 @@ namespace forward { * |----------|------------------------------------| */ template -__mlu_func__ void inplaceConvert(char *nram_ptr, const int32_t count) { +__mlu_func__ void inplaceConvert(int8_t *nram_ptr, const int32_t count) { // no need to convert when SrcType and DstType are the same } template <> -__mlu_func__ void inplaceConvert(char 
*nram_ptr, +__mlu_func__ void inplaceConvert(int8_t *nram_ptr, const int32_t count) { __bang_float2half_rd((half *)nram_ptr, (float *)nram_ptr, PAD_UP(count, COMPUTE_COUNT_ALIGN)); } template <> -__mlu_func__ void inplaceConvert(char *nram_ptr, +__mlu_func__ void inplaceConvert(int8_t *nram_ptr, const int32_t count) { __bang_half2float((float *)nram_ptr, (half *)nram_ptr + count, PAD_UP(count, COMPUTE_COUNT_ALIGN)); } template -__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size, - const int32_t dst_stride = 0, +__mlu_func__ void loadInput(int8_t *nram_input, T *dram_input, + const int32_t size, const int32_t dst_stride = 0, const int32_t src_stride = 0, const int32_t count = 1) { if (dst_stride == src_stride) { @@ -76,7 +76,7 @@ __mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size, } template <> -__mlu_func__ void loadInput(char *nram_input, half *dram_input, +__mlu_func__ void loadInput(int8_t *nram_input, half *dram_input, const int32_t size, const int32_t dst_stride, const int32_t src_stride, const int32_t count) { @@ -93,7 +93,7 @@ __mlu_func__ void loadInput(char *nram_input, half *dram_input, } template -__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t, +__mlu_func__ void loadWeight(int8_t *nram_input, T *dram_input, const int32_t t, const int32_t c, const int32_t has_weight, const int32_t partition_nc) { if (has_weight && partition_nc && t >= 0 && t < c) { @@ -102,7 +102,7 @@ __mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t, } template -__mlu_func__ void storeOutput(T *dram_output, char *nram_output, +__mlu_func__ void storeOutput(T *dram_output, int8_t *nram_output, const int32_t size, const int32_t dst_stride = 0, const int32_t src_stride = 0, const int32_t count = 1) { @@ -160,7 +160,7 @@ __mlu_func__ void compute(const focalLossSigmoidPreference_t prefer, if (gamma == float(0.0)) { __bang_write_value(compute_a, deal_num, (float)1.0); } else { 
- __bang_log(compute_a, compute_b, deal_num); + __bang_log2(compute_a, compute_b, deal_num); __bang_mul_scalar(compute_a, compute_a, (float)gamma, deal_num); __bang_pow2(compute_a, compute_a, deal_num); } @@ -200,7 +200,7 @@ __mlu_func__ void compute(const focalLossSigmoidPreference_t prefer, if (gamma == float(0.0)) { __bang_write_value(compute_a, deal_num, (float)1.0); } else { - __bang_log(compute_a, compute_b, deal_num); + __bang_log2(compute_a, compute_b, deal_num); __bang_mul_scalar(compute_a, compute_a, (float)gamma, deal_num); __bang_pow2(compute_a, compute_a, deal_num); } @@ -232,9 +232,9 @@ __mlu_func__ void compute(const focalLossSigmoidPreference_t prefer, __bang_le_scalar(input, compute_b, (float)FLT_MAX, deal_num); __bang_float2int32((int32_t *)input, input, deal_num, 0); __nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)input, (int32_t *)input, table, deal_num, - COMPUTE_COUNT_ALIGN); // NOLINT - __bang_band((char *)compute_b, (char *)compute_b, (char *)input, + __bang_lut((int32_t *)input, (uint32_t *)input, table, (uint32_t)deal_num, + COMPUTE_COUNT_ALIGN); // NOLINT + __bang_band((int8_t *)compute_b, (int8_t *)compute_b, (int8_t *)input, sizeof(float) * deal_num); // NOLINT __bang_sub(compute_a, compute_a, compute_b, deal_num); @@ -258,9 +258,9 @@ __mlu_func__ void compute(const focalLossSigmoidPreference_t prefer, template __mlu_func__ void startPipeline( const focalLossSigmoidPreference_t prefer, const T *input, - const int32_t *target, const T *weight, char *nram_compute_a, - char *nram_compute_b, char *nram_input, char *nram_target, - char *nram_weight, char *nram_output, const int32_t has_weight, + const int32_t *target, const T *weight, int8_t *nram_compute_a, + int8_t *nram_compute_b, int8_t *nram_input, int8_t *nram_target, + int8_t *nram_weight, int8_t *nram_output, const int32_t has_weight, const int32_t partition_nc, const int32_t pingpong_offset, const int32_t pingpong_weight_offset, 
const int32_t c_offset_num, const int32_t n, const int32_t n_seg, const int32_t c, const int32_t c_seg, @@ -498,12 +498,12 @@ __mlu_func__ void partitionInput(const focalLossSigmoidPreference_t prefer, const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; - char *nram_weight = (char *)nram_buffer; - char *nram_compute_a = nram_weight + has_weight * c_align_size; - char *nram_compute_b = nram_compute_a + split_pipeline_size; - char *nram_input = nram_compute_b + split_pipeline_size; - char *nram_output = nram_input + split_pipeline_size; - char *nram_target = nram_output + split_pipeline_size; + int8_t *nram_weight = (int8_t *)nram_buffer; + int8_t *nram_compute_a = nram_weight + has_weight * c_align_size; + int8_t *nram_compute_b = nram_compute_a + split_pipeline_size; + int8_t *nram_input = nram_compute_b + split_pipeline_size; + int8_t *nram_output = nram_input + split_pipeline_size; + int8_t *nram_target = nram_output + split_pipeline_size; startPipeline(prefer, input, target, weight, nram_compute_a, nram_compute_b, nram_input, nram_target, nram_weight, nram_output, has_weight, 0, pingpong_offset, 0, 0, n, @@ -522,12 +522,12 @@ __mlu_func__ void partitionInput(const focalLossSigmoidPreference_t prefer, (MAX_NRAM_SIZE - weight_size - compute_size) / 2; const int32_t pingpong_weight_offset = weight_size / 2; - char *nram_weight = (char *)nram_buffer; - char *nram_compute_a = nram_weight + weight_size; - char *nram_compute_b = nram_compute_a + split_pipeline_size; - char *nram_input = nram_compute_b + split_pipeline_size; - char *nram_output = nram_input + split_pipeline_size; - char *nram_target = nram_output + split_pipeline_size; + int8_t *nram_weight = (int8_t *)nram_buffer; + int8_t *nram_compute_a = nram_weight + weight_size; + int8_t *nram_compute_b = nram_compute_a + split_pipeline_size; + int8_t *nram_input = nram_compute_b + split_pipeline_size; + int8_t *nram_output = nram_input + split_pipeline_size; + int8_t *nram_target = 
nram_output + split_pipeline_size; const int32_t loop_num = (c + c_seg - 1) / c_seg; const int32_t partition_nc = 1; diff --git a/kernels/generate_proposals_v2/generate_proposals_v2.cpp b/kernels/generate_proposals_v2/generate_proposals_v2.cpp index 5ac2b8e83..dac9a9abb 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2.cpp +++ b/kernels/generate_proposals_v2/generate_proposals_v2.cpp @@ -51,9 +51,9 @@ static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, k_dim->z = 1; if (job < 4) { k_dim->x = 1; - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; } else { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); } return; @@ -429,7 +429,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpGenerateProposalsV2( VLOG(5) << "N : " << n; const size_t hwa = h * w * a; cnrtDim3_t k_dim; - cnrtJobType_t k_type; + cnrtFunctionType_t k_type; policyFunc(handle, &k_dim, &k_type, hwa); VLOG(5) << "Launch Kernel KernelGenerateProposalsV2 <<>>"; @@ -475,8 +475,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpGenerateProposalsV2( mluOpGetSizeOfDataType(scores_desc->dtype, &data_size); const size_t indices_size = PAD_UP(n * max_k * data_size, GDRAM_ALIGN_SIZE); - void *sorted_score = (void *)((char *)workspace + tok_workspace_align_size); - void *sorted_index = (void *)((char *)sorted_score + indices_size); + void *sorted_score = + (void *)((int8_t *)workspace + tok_workspace_align_size); + void *sorted_index = (void *)((int8_t *)sorted_score + indices_size); // call cnnlTopK CALL_CNNL(cnnlTopKTensor_v3( @@ -488,7 +489,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpGenerateProposalsV2( CALL_CNNL(cnnlDestroyTensorDescriptor(sorted_score_desc)); CALL_CNNL(cnnlDestroyTensorDescriptor(sorted_index_desc)); DESTROY_CNNL_HANDLE(cnnl_handle); - void *workspace_buffer = (void *)((char *)sorted_index + indices_size); + void *workspace_buffer = (void *)((int8_t *)sorted_index + indices_size); CHECK_RETURN( 
"[mluOpGenerateProposalsV2]", KernelGenerateProposalsV2( diff --git a/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h b/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h index 40bd1c1e7..dad6fd156 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h +++ b/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h @@ -25,8 +25,8 @@ #define PROPOSAL_NRAM_SIZE MAX_NRAM_SIZE #define CALC_AREA_NRAM_FLT_CAP CALC_AREA_NRAM_SIZE / sizeof(float) -__nram__ char nram_buffer[PROPOSAL_NRAM_SIZE]; -__mlu_shared__ char sram_buffer[MAX_SRAM_SIZE]; +__nram__ int8_t nram_buffer[PROPOSAL_NRAM_SIZE]; +__mlu_shared__ int8_t sram_buffer[MAX_SRAM_SIZE]; #define FLOAT_MIN_GPV2 (-(float)FLT_MAX) @@ -151,14 +151,12 @@ __mlu_func__ void storeResult(T *max_box, T *nram_save, T *&output_boxes_tmp, (float(max_box[0]) <= FLOAT_MIN_GPV2) || keep == nms_num - 1) { if (nram_save_count != 0) { if (clusterId == 0 && coreId == 0) { - pvLock(); // x1, y1, x2, y2 __memcpy(output_boxes_tmp, nram_save + 1, 4 * sizeof(T), NRAM2GDRAM, 4 * sizeof(T), 5 * sizeof(T), nram_save_count - 1); // score __memcpy(output_scores_tmp, nram_save, sizeof(T), NRAM2GDRAM, sizeof(T), 5 * sizeof(T), nram_save_count - 1); - pvUnlock(); output_boxes_tmp += nram_save_count * 4; output_scores_tmp += nram_save_count; nram_save_count = 0; diff --git a/kernels/generate_proposals_v2/generate_proposals_v2_union_default.mlu b/kernels/generate_proposals_v2/generate_proposals_v2_union_default.mlu index ef7b68109..6ea808a00 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2_union_default.mlu +++ b/kernels/generate_proposals_v2/generate_proposals_v2_union_default.mlu @@ -169,11 +169,7 @@ __mlu_func__ void getKthScore(const T *intput_scores_ptr, T *workspace, if (taskId == 0) { __memcpy(workspace, intput_scores_ptr, HWA * sizeof(T), GDRAM2GDRAM); -#if __BANG_ARCH__ >= 322 const int memory_block = 2; -#else - const int memory_block = 3; -#endif const int limit = 
PROPOSAL_NRAM_SIZE / memory_block; const int max_seg_num = FLOOR_ALIGN(limit / sizeof(T), ALIGN_NUM); @@ -193,13 +189,7 @@ __mlu_func__ void getKthScore(const T *intput_scores_ptr, T *workspace, __memcpy(scores, workspace + seg_id * max_seg_num, actual_num * sizeof(T), GDRAM2NRAM); -#if __BANG_ARCH__ >= 322 __bang_eq_scalar(mask_eq, scores, k_score[0], actual_num_align); -#else - T *tmp = mask_eq + max_seg_num; - __bang_write_value(tmp, actual_num_align, k_score[0]); - __bang_eq(mask_eq, scores, tmp, actual_num_align); -#endif for (int i = actual_num; i < actual_num_align; i++) { mask_eq[i] = 0; @@ -381,14 +371,8 @@ __mlu_func__ void filterBoxes(T *proposal_scores, T *proposal_boxes, } // cx = box[0] + 0.5 * w, cy = box[1] + 0.5 * h -#if __BANG_ARCH__ >= 322 __bang_fusion(FUSION_FMA, cx, w, (T)0.5, xmin, align_count, align_count); __bang_fusion(FUSION_FMA, cy, h, (T)0.5, ymin, align_count, align_count); -#else - __bang_mul_scalar(cx, w, (T)0.5, align_count * 2); - __bang_add(cx, xmin, cx, align_count); - __bang_add(cy, ymin, cy, align_count); -#endif float real_min_size = min_size > 1.0 ? min_size : 1.0; // mask_tmp1 = w >= min_size ? 
1 : 0; @@ -463,22 +447,16 @@ __mlu_func__ void createAndRemoveBox( const T *variances_ptr, T *workspace, const T k_score, const int HWA, const int pre_nms_top_n, const T min_size, const bool pixel_offset, const bool need_collect, int *proposals_num) { - // nram n = max_seg_num, transpose: 200 32N, 300 4N + // nram n = max_seg_num, transpose: 300 4N // | scores | anchors | var | deltals | proposals | ge_mask | nram | // MLU300 // | N | 4N | 4N | 4N | 4N | N | 10N | - // MLU200 - // | N | 4N | 4N | 4N | trans_buffer 32N | // workspace // | output_scores | output_boxes | scores_tmp | boxes_tmp | collect_num | // | HWA | 4*HWA | HWA | 4*HWA | taskDim | -#if __BANG_ARCH__ >= 300 const int memory_block = 28; -#else - const int memory_block = 45; -#endif const int limit = PROPOSAL_NRAM_SIZE / memory_block; int max_seg_num = 0; int repeat = 0; diff --git a/kernels/kernel.h b/kernels/kernel.h index 973fd8df7..d1a6e96fb 100644 --- a/kernels/kernel.h +++ b/kernels/kernel.h @@ -135,18 +135,7 @@ #endif // maximum integer that can be represented by float -#if __BANG_ARCH__ >= 322 #define MAX_INT2FLOAT_EXACT (powf(2, 24)) #define NEG_MAX_INT2FLOAT_EXACT (-powf(2, 24)) -#else -#define MAX_INT2FLOAT_EXACT (powf(2, 23) - 1) -#define NEG_MAX_INT2FLOAT_EXACT (-powf(2, 23)) -#endif - -#define MLU_KERNEL_ASSERT(cond, message) \ - if (!(cond)) { \ - __assert_fail(message, __FILE__, static_cast(__LINE__), \ - __func__); \ - } #endif // KERNELS_KERNEL_H_ diff --git a/kernels/lgamma/lgamma.cpp b/kernels/lgamma/lgamma.cpp index 657b7f267..662259c21 100644 --- a/kernels/lgamma/lgamma.cpp +++ b/kernels/lgamma/lgamma.cpp @@ -63,7 +63,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpLgamma(mluOpHandle_t handle, VLOG(5) << "[mluOpLgamma] launch kernel policyFUnc[" << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << "]"; - int element_num = mluOpGetTensorElementNum(x_desc); + size_t element_num = mluOpGetTensorElementNum(x_desc); if (handle->arch < MLUOP_MLU370) { LOG(ERROR) << "[mluOpLgamma] now only 
support ARCH >= \n"; return MLUOP_STATUS_ARCH_MISMATCH; diff --git a/kernels/lgamma/lgamma.h b/kernels/lgamma/lgamma.h index 0125acdd3..67fdca009 100644 --- a/kernels/lgamma/lgamma.h +++ b/kernels/lgamma/lgamma.h @@ -28,7 +28,7 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineLgamma( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *x, void *y, const int num); + mluOpDataType_t d_type, const void *x, void *y, const size_t num); mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineWithStrideLgamma( const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, diff --git a/kernels/lgamma/lgamma_block.mlu b/kernels/lgamma/lgamma_block.mlu index 8bc85497b..f3366189c 100644 --- a/kernels/lgamma/lgamma_block.mlu +++ b/kernels/lgamma/lgamma_block.mlu @@ -31,7 +31,7 @@ #define AUX_N 3 -__nram__ char nram_buffer[UNARY_NRAM_SIZE]; +__nram__ int8_t nram_buffer[UNARY_NRAM_SIZE]; const float epsilon = 1e-15; const uint32_t inf_float_value = 0x7f800000; @@ -46,10 +46,10 @@ __mlu_func__ void mux2(float *dst, float *a, float *b, float *sel, uint32_t sz) { __bang_srl((uint32_t *)sel, (uint32_t *)sel, 29, sz); __bang_mul_scalar((int32_t *)sel, (int32_t *)sel, -1, sz); - __bang_band((char *)dst, (char *)a, (char *)sel, sz * sizeof(float)); - __bang_bnot((char *)sel, (char *)sel, sz * sizeof(float)); - __bang_band((char *)sel, (char *)b, (char *)sel, sz * sizeof(float)); - __bang_bor((char *)dst, (char *)dst, (char *)sel, sz * sizeof(float)); + __bang_band((int8_t *)dst, (int8_t *)a, (int8_t *)sel, sz * sizeof(float)); + __bang_bnot((int8_t *)sel, (int8_t *)sel, sz * sizeof(float)); + __bang_band((int8_t *)sel, (int8_t *)b, (int8_t *)sel, sz * sizeof(float)); + __bang_bor((int8_t *)dst, (int8_t *)dst, (int8_t *)sel, sz * sizeof(float)); } __mlu_func__ void isFinite(float *dst, float *src, uint32_t sz) { @@ -65,7 +65,7 @@ __mlu_func__ void isInf(float *dst, float *src, uint32_t sz) { } __mlu_func__ void logHp(float *dst, float *src, uint32_t 
sz) { - __bang_log(dst, src, sz); + __bang_log2(dst, src, sz); __bang_mul_scalar(dst, dst, log_e_2, sz); } @@ -199,8 +199,8 @@ __mlu_func__ void auxFunc3LgammaFloat( } template -__mlu_func__ void computeLgammaFloat(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeLgammaFloat(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num) { float *aux_array[AUX_N]; for (size_t i = 0; i < AUX_N; i++) { @@ -228,8 +228,8 @@ __mlu_func__ void auxFunc3LgammaHalf(size_t &output_input_gap, } template -__mlu_func__ void computeLgammaHalf(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeLgammaHalf(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num) { float *aux_array[AUX_N]; for (size_t i = 0; i < AUX_N; i++) { @@ -253,13 +253,13 @@ UNARY_OP_KERNEL_3PIPELINE_WITH_STRIDE_IMPLE(Lgamma, Half); mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineLgamma( const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, const cnrtQueue_t queue, const mluOpDataType_t d_type, const void *x, - void *y, const int32_t num) { + void *y, const size_t num) { if (d_type == MLUOP_DTYPE_FLOAT) { KERNEL_CHECK(MLUBlockKernel3StagePipelineLgammaFloat - <<>>((char *)x, (char *)y, num);); + <<>>((int8_t *)x, (int8_t *)y, num);); } else { // d_type == MLUOP_DTYPE_HALF KERNEL_CHECK(MLUBlockKernel3StagePipelineLgammaHalf - <<>>((char *)x, (char *)y, num);); + <<>>((int8_t *)x, (int8_t *)y, num);); } return MLUOP_STATUS_SUCCESS; } @@ -270,11 +270,13 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineWithStrideLgamma( mluop::TensorShape x_shape, void *y, mluop::TensorShape y_shape, size_t element_num) { if (d_type == MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUBlockKernel3StagePipelineLgammaFloat - <<>>((char *)x, (char *)y, element_num)); + KERNEL_CHECK( + 
MLUBlockKernel3StagePipelineLgammaFloat + <<>>((int8_t *)x, (int8_t *)y, element_num)); } else { - KERNEL_CHECK(MLUBlockKernel3StagePipelineLgammaHalf - <<>>((char *)x, (char *)y, element_num)); + KERNEL_CHECK( + MLUBlockKernel3StagePipelineLgammaHalf + <<>>((int8_t *)x, (int8_t *)y, element_num)); } return MLUOP_STATUS_SUCCESS; } diff --git a/kernels/log/log_union1.mlu b/kernels/log/log_union1.mlu index f8ec31966..279568bd9 100644 --- a/kernels/log/log_union1.mlu +++ b/kernels/log/log_union1.mlu @@ -33,9 +33,9 @@ #define LOG_RECOVER -27.6310211159285482 __nram__ float nram_tmp[NFU_ALIGN_SIZE]; -__nram__ char nram_buffer[UNARY_NRAM_SIZE]; +__nram__ int8_t nram_buffer[UNARY_NRAM_SIZE]; #if __BANG_ARCH__ != 520 -__mlu_shared__ char sram_buffer[UNARY_SRAM_SIZE]; +__mlu_shared__ int8_t sram_buffer[UNARY_SRAM_SIZE]; #endif template @@ -55,11 +55,11 @@ __mlu_func__ void auxFunc3LogFloat(size_t &output_input_gap, } template -__mlu_func__ void computeLogFloat(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeLogFloat(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num, float coef) { - __bang_log((float *)nram_output, (float *)nram_input, actual_num); + __bang_log2((float *)nram_output, (float *)nram_input, actual_num); __bang_mul_scalar((float *)nram_output, (float *)nram_output, (float)coef, deal_num); } @@ -81,12 +81,12 @@ __mlu_func__ void auxFunc3LogHalf(size_t &output_input_gap, } template -__mlu_func__ void computeLogHalf(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeLogHalf(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num, float coef) { __bang_half2float((float *)nram_output, (half *)nram_input, deal_num); - __bang_log((float *)nram_output, (float *)nram_output, actual_num); + __bang_log2((float *)nram_output, (float 
*)nram_output, actual_num); __bang_mul_scalar((float *)nram_output, (float *)nram_output, coef, deal_num); __mluop_float2half((half *)nram_output, (float *)nram_output, deal_num); } @@ -135,12 +135,14 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineLog( const void *x, void *y, size_t num, float coef) { // launch kernel if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUBlockKernel3StagePipelineLogFloat - <<>>((char *)x, (char *)y, num, coef)); + KERNEL_CHECK( + MLUBlockKernel3StagePipelineLogFloat + <<>>((int8_t *)x, (int8_t *)y, num, coef)); } else { // half - KERNEL_CHECK(MLUBlockKernel3StagePipelineLogHalf - <<>>((char *)x, (char *)y, num, coef)); + KERNEL_CHECK( + MLUBlockKernel3StagePipelineLogHalf + <<>>((int8_t *)x, (int8_t *)y, num, coef)); } return MLUOP_STATUS_SUCCESS; } @@ -151,11 +153,13 @@ mluOpStatus_t MLUOP_WIN_API Kernel5StagePipelineLog( const void *x, void *y, size_t num, float coef) { // launch kernel if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUBlockKernel5StagePipelineLogFloat - <<>>((char *)x, (char *)y, num, coef)); + KERNEL_CHECK( + MLUBlockKernel5StagePipelineLogFloat + <<>>((int8_t *)x, (int8_t *)y, num, coef)); } else { - KERNEL_CHECK(MLUBlockKernel5StagePipelineLogHalf - <<>>((char *)x, (char *)y, num, coef)); + KERNEL_CHECK( + MLUBlockKernel5StagePipelineLogHalf + <<>>((int8_t *)x, (int8_t *)y, num, coef)); } return MLUOP_STATUS_SUCCESS; } diff --git a/kernels/logspace/logspace.cpp b/kernels/logspace/logspace.cpp new file mode 100644 index 000000000..ea7963e05 --- /dev/null +++ b/kernels/logspace/logspace.cpp @@ -0,0 +1,115 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "logspace.h" +#include "core/context.h" +#include "core/gen_case.h" +#include "core/logging.h" +#include "core/runtime/device.h" +#include "core/tensor.h" +#include "core/type.h" +#include "kernels/unary_op/unary_op_host.h" + +static void LogspacePolicyFunc(const mluOpHandle_t &handle, const int64_t steps, + cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { + *k_type = CNRT_FUNC_TYPE_BLOCK; + uint32_t cluster_num = + mluop::runtime::getCoreNumOfEachUnionCapability(handle); + uint32_t core_in_cluster = handle->core_num_per_cluster; + uint32_t core_max = cluster_num * core_in_cluster; + uint32_t core_used = core_max > steps ? 
steps : core_max; + k_dim->x = core_used; + k_dim->y = 1; + k_dim->z = 1; +} + +static inline bool isSupportType(const mluOpDataType_t check_type, + const mluOpDataType_t support_type[], + const int len) { + for (int i = 0; i < len; ++i) { + if (check_type == support_type[i]) { + return true; + } + } + return false; +} + +mluOpStatus_t LogspaceParamCheck(const mluOpHandle_t &handle, const float start, + const float end, const int64_t steps, + const float base, + const mluOpTensorDescriptor_t &res_desc, + const void *res) { + PARAM_CHECK("[mluOpLogspace]", handle != nullptr); + PARAM_CHECK("[mluOpLogspace]", res_desc != nullptr); + PARAM_CHECK("[mluOpLogspace]", steps >= 0); + size_t element_num = mluOpGetTensorElementNum(res_desc); + PARAM_CHECK("[mluOpLogspace]", steps <= element_num); + mluOpDataType_t support_type[3] = {MLUOP_DTYPE_FLOAT, MLUOP_DTYPE_HALF, + MLUOP_DTYPE_INT32}; + if (!isSupportType(res_desc->dtype, support_type, 3)) { + LOG(ERROR) << "[mluOpLogspace]" + << ":res_desc's data type is not supported."; + return MLUOP_STATUS_BAD_PARAM; + } + return MLUOP_STATUS_SUCCESS; +} + +mluOpStatus_t MLUOP_WIN_API +mluOpLogspace(mluOpHandle_t handle, const float start, const float end, + const int64_t steps, const float base, + const mluOpTensorDescriptor_t res_desc, void *res) { + // param check + mluOpStatus_t param_check = + LogspaceParamCheck(handle, start, end, steps, base, res_desc, res); + if (param_check != MLUOP_STATUS_SUCCESS) { + return param_check; + } + + if (steps == 0) { + return MLUOP_STATUS_SUCCESS; + } + + // generate prototxt + if (MLUOP_GEN_CASE_ON_NEW) { + GEN_CASE_START("logspace", "LOGSPACE"); + GEN_CASE_HANDLE(handle); + GEN_CASE_OP_PARAM_SINGLE(0, "logspace", "start", start); + GEN_CASE_OP_PARAM_SINGLE(1, "logspace", "end", end); + GEN_CASE_OP_PARAM_SINGLE(2, "logspace", "steps", steps); + GEN_CASE_OP_PARAM_SINGLE(3, "logspace", "base", base); + GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); + } + + // policy select + 
cnrtDim3_t k_dim; + cnrtFunctionType_t k_type; + LogspacePolicyFunc(handle, steps, &k_dim, &k_type); + VLOG(5) << "[mluOpLogspace] launch kernel policyFUnc[" << k_dim.x << ", " + << k_dim.y << ", " << k_dim.z << "]"; + + VLOG(5) << "kernel KernelLogspace."; + CHECK_RETURN("[mluOpLogspace] ", + KernelLogspace(k_dim, k_type, handle->queue, res_desc->dtype, + start, end, steps, base, res)); + GEN_CASE_END(); + return MLUOP_STATUS_SUCCESS; +} diff --git a/kernels/logspace/logspace.h b/kernels/logspace/logspace.h new file mode 100644 index 000000000..89e2d3ce1 --- /dev/null +++ b/kernels/logspace/logspace.h @@ -0,0 +1,33 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef KERNELS_LOGSPACE_LOGSPACE_H +#define KERNELS_LOGSPACE_LOGSPACE_H + +#include "mlu_op.h" + +mluOpStatus_t MLUOP_WIN_API KernelLogspace( + const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, + const cnrtQueue_t queue, const mluOpDataType_t d_type, const float start, + const float end, const int64_t steps, const float base, void *res); + +#endif // KERNELS_LOGSPACE_LOGSPACE_H diff --git a/kernels/logspace/logspace_block.mlu b/kernels/logspace/logspace_block.mlu new file mode 100644 index 000000000..5976f6e21 --- /dev/null +++ b/kernels/logspace/logspace_block.mlu @@ -0,0 +1,398 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "logspace.h" +#include "core/logging.h" +#include "kernels/debug.h" +#include "kernels/kernel.h" +#include "kernels/unary_op/unary_op_3pipeline.h" +#include "kernels/unary_op/unary_op_5pipeline.h" + +#define ALIGN_NUM_LOGSPACE 32 +#define LUT_TABEL_LENGTH 64 +#define LOGSPACE_NRAM_USED (MAX_NRAM_SIZE - 4 * LUT_TABEL_LENGTH * sizeof(int)) + +__nram__ int8_t nram_buffer[LOGSPACE_NRAM_USED]; + +template +__mlu_func__ void setResult(T *res, unsigned int elem_count, T value) { + const size_t per_core_num = elem_count / taskDim; + int32_t remain_num = elem_count % taskDim; + const size_t cur_core_num = + taskId < remain_num ? per_core_num + 1 : per_core_num; + size_t cur_core_offset = + taskId * per_core_num + (taskId < remain_num ? taskId : remain_num); + if (cur_core_num > 0) { + __gdramset(res + cur_core_offset, cur_core_num, value); + } +} + +template +__mlu_func__ void float2DifferentType(float *result_float, T *result, + const int num) { + if (std::is_same::value) { + float *result_ge_half_max = result_float + num; + __bang_abs(result_ge_half_max, result_float, num); + int half_max = 65504; + __bang_ge_scalar(result_ge_half_max, result_ge_half_max, half_max, num); + __bang_float2int16_rd((int16_t *)result_ge_half_max, result_ge_half_max, + num, 0); + __nram__ int16_t table_half_all1[LUT_TABEL_LENGTH] = {0, (int16_t)0x7cff}; + __bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max, + (int16_t *)table_half_all1, num, LUT_TABEL_LENGTH); + __bang_float2half_tz((half *)result, result_float, num); + __bang_bor((int16_t *)result, (int16_t *)result, + (int16_t *)result_ge_half_max, num); + __bang_ge_scalar((int16_t *)result_ge_half_max, + (int16_t *)result_ge_half_max, 1, num); + __nram__ int16_t table_half_inf[LUT_TABEL_LENGTH] = {(int16_t)0xffff, + (int16_t)0xfc00}; + __bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max, + (int16_t 
*)table_half_inf, num, LUT_TABEL_LENGTH); + __bang_band((int16_t *)result, (int16_t *)result, + (int16_t *)result_ge_half_max, num); + } + if (std::is_same::value) { + __bang_float2int32_tz((int *)result, result_float, num, 0); + } +} + +template +__mlu_func__ void dealStep1(const float start, const float base, T *res) { + if (taskId == 0) { + *res = (T)powf(base, start); + } +} + +template +__mlu_func__ void dealAll1Nan(const float start, const float end, + const int64_t steps, T *res) { + float all_the_result = 1; + if ((abs(start) == INFINITY) && (abs(end) == INFINITY)) { + all_the_result = NAN; + } + setResult(res, steps, (T)all_the_result); +} + +template +__mlu_func__ void dealStartEndInfinity(const float start, const float end, + const int64_t steps, const float base, + T *res) { + if (abs(start) == INFINITY) { + if (steps > 2) { + float part_result = powf(base, start); + if (std::is_same::value) { + if (base < 0) { + part_result = INFINITY; + } else { + part_result = NAN; + } + } + setResult(res + steps / 2, (steps - 1) / 2, (T)part_result); + } + setResult(res, steps / 2, (T)NAN); + setResult(res + steps - 1, 1, (T)NAN); + } else if (abs(end) == INFINITY) { + if (steps > 2) { + float part_result = powf(base, end); + if (std::is_same::value) { + if (base < 0) { + part_result = INFINITY; + } else { + part_result = NAN; + } + } + setResult(res + 1, (steps - 1) / 2, (T)part_result); + } + setResult(res, 1, (T)NAN); + setResult(res + (steps + 1) / 2, steps / 2, (T)NAN); + } +} + +template +__mlu_func__ void dealBase0(const float start, const float end, + const int64_t steps, T *res) { + if (start * end > 0) { + if (start > 0) { + setResult(res, steps, (T)0); + } else { + setResult(res, steps, (T)INFINITY); + } + } else { + float step = (float)(end - start) / (steps - 1); + int numbers_form_start_to_0 = abs(start / step) + 1; + if (start > 0) { + setResult(res, numbers_form_start_to_0, (T)0); + setResult(res + numbers_form_start_to_0, steps - 
numbers_form_start_to_0, + (T)INFINITY); + } else { + setResult(res, numbers_form_start_to_0, (T)INFINITY); + setResult(res + numbers_form_start_to_0, steps - numbers_form_start_to_0, + (T)0); + } + } +} + +template +__mlu_func__ void dealStep0(const float start, const float end, + const int64_t steps, const float base, T *res) { + float base_start = powf(base, start); + int half_max = 65504; + if (abs(base_start) > half_max && std::is_same::value) { + base_start = INFINITY; + } + float base_end = powf(base, end); + if (abs(base_end) > half_max && std::is_same::value) { + base_end = INFINITY; + } + // if steps >= 65520 * 2 + 1, the result from index 65520 will be nan, and the + // length is steps - 65520 * 2 + if ((steps < 65520 * 2 + 1) || + (abs((float)(end - start) / (steps - 1) == 0))) { + setResult(res, steps / 2, (T)base_start); + setResult(res + steps / 2, (steps + 1) / 2, (T)base_end); + + } else { + setResult(res, 65520, (T)base_start); + setResult(res + 65520, steps - 65520 * 2, (T)NAN); + setResult(res + steps - 65520, 65520, (T)base_end); + } +} + +template +__mlu_func__ void dealBaseNegative(const float start, const float end, + const int64_t steps, const float base, + T *res) { + const int32_t max_deal_num = + PAD_DOWN(LOGSPACE_NRAM_USED / 5 / sizeof(float), ALIGN_NUM_LOGSPACE); + float *log2_result = (float *)nram_buffer; + float *result_float = (float *)nram_buffer; + T *result = (T *)nram_buffer; + float *floor_y = (float *)nram_buffer + 2 * max_deal_num; + float *y_copy = (float *)nram_buffer + 3 * max_deal_num; + int *all_int_1 = (int *)nram_buffer + 4 * max_deal_num; + __bang_write_value(all_int_1, max_deal_num, (int)1); + const size_t per_core_num = steps / taskDim; + int32_t remain_num = steps % taskDim; + const size_t cur_core_num = + taskId < remain_num ? per_core_num + 1 : per_core_num; + size_t cur_core_offset = + taskId * per_core_num + (taskId < remain_num ? 
taskId : remain_num); + const int32_t repeat_steps = cur_core_num / max_deal_num; + const int32_t remain_steps = cur_core_num % max_deal_num; + + float base_log; + base_log = log2f(-base); + float step = (float)(end - start) / (steps - 1); + + for (int step_i = 0; step_i <= repeat_steps; step_i++) { + if (step_i == repeat_steps && remain_steps == 0) { + break; + } + const int32_t actual_deal_num = + step_i == repeat_steps ? remain_steps : max_deal_num; + const int64_t loop_offset = cur_core_offset + step_i * max_deal_num; + + const int64_t halfway = steps / 2; + if (loop_offset + actual_deal_num < halfway) { + __mluop_get_indices(log2_result, loop_offset, actual_deal_num); + __bang_mul_scalar(log2_result, log2_result, step, actual_deal_num); + __bang_add_scalar(log2_result, log2_result, (float)start, + actual_deal_num); + } else if (loop_offset >= halfway) { + __mluop_get_indices(log2_result, loop_offset + 1 - steps, + actual_deal_num); + __bang_mul_scalar(log2_result, log2_result, step, actual_deal_num); + __bang_add_scalar(log2_result, log2_result, (float)end, actual_deal_num); + } else { + const int64_t offset2halfway = halfway - loop_offset; + if (offset2halfway > 0) { + __mluop_get_indices(log2_result, loop_offset, offset2halfway); + __bang_mul_scalar(log2_result, log2_result, step, offset2halfway); + __bang_add_scalar(log2_result, log2_result, (float)start, + offset2halfway); + } + const int64_t deal_num_sub_halfway = actual_deal_num - offset2halfway; + if (deal_num_sub_halfway > 0) { + __mluop_get_indices(log2_result + offset2halfway, -(steps - 1) / 2, + deal_num_sub_halfway); + __bang_mul_scalar(log2_result + offset2halfway, + log2_result + offset2halfway, step, + deal_num_sub_halfway); + __bang_add_scalar(log2_result + offset2halfway, + log2_result + offset2halfway, (float)end, + deal_num_sub_halfway); + } + } + + __bang_floor(floor_y, log2_result, actual_deal_num); + __bang_eq(floor_y, floor_y, log2_result, actual_deal_num); + __bang_float2int32((int 
*)floor_y, floor_y, actual_deal_num, 0); + __bang_move(y_copy, log2_result, sizeof(float) * actual_deal_num); + __bang_float2int32((int *)y_copy, y_copy, actual_deal_num, 0); + __bang_band((int *)y_copy, (int *)y_copy, all_int_1, actual_deal_num); + __bang_band((int *)y_copy, (int *)y_copy, (int *)floor_y, actual_deal_num); + __nram__ uint32_t table_for_odd_or_even_power[LUT_TABEL_LENGTH] = { + 0, 0x80000000}; + __bang_lut((int32_t *)y_copy, (uint32_t *)y_copy, + (int32_t *)table_for_odd_or_even_power, actual_deal_num, + LUT_TABEL_LENGTH); + __nram__ int table_for_integer_power[LUT_TABEL_LENGTH] = {0x7fffffff, 0}; + __bang_lut((int32_t *)floor_y, (uint32_t *)floor_y, + (int32_t *)table_for_integer_power, actual_deal_num, + LUT_TABEL_LENGTH); + __bang_bor((int *)log2_result, (int *)log2_result, (int *)floor_y, + actual_deal_num); + __bang_mul_scalar(log2_result, log2_result, base_log, actual_deal_num); + __bang_pow2(result_float, log2_result, actual_deal_num); + __bang_bor((int *)result_float, (int *)result_float, (int *)y_copy, + actual_deal_num); + float2DifferentType(result_float, result, actual_deal_num); + __memcpy(res + loop_offset, result, actual_deal_num * sizeof(T), + NRAM2GDRAM); + } +} + +template +__mlu_func__ void dealNormalCase(const float start, const float end, + const int64_t steps, const float base, + T *res) { + const int32_t max_deal_num = + PAD_DOWN(LOGSPACE_NRAM_USED / 2 / sizeof(float), ALIGN_NUM_LOGSPACE); + float *log2_result = (float *)nram_buffer; + float *result_float = (float *)nram_buffer; + T *result = (T *)nram_buffer; + const size_t per_core_num = steps / taskDim; + int32_t remain_num = steps % taskDim; + const size_t cur_core_num = + taskId < remain_num ? per_core_num + 1 : per_core_num; + size_t cur_core_offset = + taskId * per_core_num + (taskId < remain_num ? 
taskId : remain_num); + const int32_t repeat_steps = cur_core_num / max_deal_num; + const int32_t remain_steps = cur_core_num % max_deal_num; + + float base_log; + if (base == -INFINITY) { + base_log = log2f(-base); + } else { + base_log = log2f(base); + } + float step = (float)(end - start) / (steps - 1); + + for (int step_i = 0; step_i <= repeat_steps; step_i++) { + if (step_i == repeat_steps && remain_steps == 0) { + break; + } + const int32_t actual_deal_num = + step_i == repeat_steps ? remain_steps : max_deal_num; + const int64_t loop_offset = cur_core_offset + step_i * max_deal_num; + + const int64_t halfway = steps / 2; + if (loop_offset + actual_deal_num < halfway) { + __mluop_get_indices(log2_result, loop_offset, actual_deal_num); + __bang_mul_scalar(log2_result, log2_result, step, actual_deal_num); + __bang_add_scalar(log2_result, log2_result, (float)start, + actual_deal_num); + } else if (loop_offset >= halfway) { + __mluop_get_indices(log2_result, loop_offset + 1 - steps, + actual_deal_num); + __bang_mul_scalar(log2_result, log2_result, step, actual_deal_num); + __bang_add_scalar(log2_result, log2_result, (float)end, actual_deal_num); + } else { + const int64_t offset2halfway = halfway - loop_offset; + if (offset2halfway > 0) { + __mluop_get_indices(log2_result, loop_offset, offset2halfway); + __bang_mul_scalar(log2_result, log2_result, step, offset2halfway); + __bang_add_scalar(log2_result, log2_result, (float)start, + offset2halfway); + } + const int64_t deal_num_sub_halfway = actual_deal_num - offset2halfway; + if (deal_num_sub_halfway > 0) { + __mluop_get_indices(log2_result + offset2halfway, -(steps - 1) / 2, + deal_num_sub_halfway); + __bang_mul_scalar(log2_result + offset2halfway, + log2_result + offset2halfway, step, + deal_num_sub_halfway); + __bang_add_scalar(log2_result + offset2halfway, + log2_result + offset2halfway, (float)end, + deal_num_sub_halfway); + } + } + __bang_mul_scalar(log2_result, log2_result, base_log, actual_deal_num); + 
__bang_pow2(result_float, log2_result, actual_deal_num); + float2DifferentType(result_float, result, actual_deal_num); + __memcpy(res + loop_offset, result, actual_deal_num * sizeof(T), + NRAM2GDRAM); + } +} + +template +__mlu_global__ void MLUKernelLogspace(const float start, const float end, + const int64_t steps, const float base, + T *res) { + float scalar_start = (T)start; + float scalar_end = (T)end; + if (steps == 1) { + dealStep1(scalar_start, base, res); + } else if ((scalar_start == 0 && scalar_end == 0) || base == 1 || + ((abs(scalar_start) == INFINITY) && + (abs(scalar_end) == INFINITY))) { + dealAll1Nan(scalar_start, scalar_end, steps, res); + } else if (abs(scalar_start) == INFINITY || abs(scalar_end) == INFINITY) { + dealStartEndInfinity(scalar_start, scalar_end, steps, base, res); + } else if (base == 0) { + dealBase0(scalar_start, scalar_end, steps, res); + } else if ((abs((float)(scalar_end - scalar_start) / (steps - 1) == 0)) || + ((std::is_same::value) && + (abs((float)(scalar_end - scalar_start) / (steps - 1)) < + 0.0009765625))) { + dealStep0(scalar_start, scalar_end, steps, base, res); + } else if (base < 0 && base != -INFINITY) { + dealBaseNegative(scalar_start, scalar_end, steps, base, res); + } else { + dealNormalCase(scalar_start, scalar_end, steps, base, res); + } + __sync(); +} + +mluOpStatus_t MLUOP_WIN_API KernelLogspace( + const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, + const cnrtQueue_t queue, const mluOpDataType_t d_type, const float start, + const float end, const int64_t steps, const float base, void *res) { + switch (d_type) { + case MLUOP_DTYPE_FLOAT: { + KERNEL_CHECK(MLUKernelLogspace<<>>( + start, end, steps, base, (float *)res)); + }; break; + case MLUOP_DTYPE_HALF: { + KERNEL_CHECK(MLUKernelLogspace<<>>( + start, end, steps, base, (half *)res)); + }; break; + case MLUOP_DTYPE_INT32: { + KERNEL_CHECK(MLUKernelLogspace<<>>( + start, end, steps, base, (int *)res)); + }; break; + default: + break; + } + return 
MLUOP_STATUS_SUCCESS; +} diff --git a/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward.cpp b/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward.cpp index 0505a45dd..f36668071 100644 --- a/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward.cpp +++ b/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward.cpp @@ -42,7 +42,7 @@ static void policyFunc(const mluOpHandle_t handle, const int mask_cnt, k_dim->y = (task_dim / core_limit) > cluster_limit ? cluster_limit : (task_dim / core_limit); k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } static mluOpStatus_t maskedCol2imForwardPreCheck( @@ -269,8 +269,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpMaskedCol2imForward( // generate mluOpMaskedCol2imForward prototxt end! mluOpDataType_t input_dtype = col_desc->dtype; void *col_workspace = workspace; - void *im_workspace = (char *)workspace + col_desc->total_tensor_size; - void *transpose_workspace = (char *)im_workspace + im_desc->total_tensor_size; + void *im_workspace = (int8_t *)workspace + col_desc->total_tensor_size; + void *transpose_workspace = + (int8_t *)im_workspace + im_desc->total_tensor_size; cnrtDim3_t k_dim; cnrtFunctionType_t k_type; diff --git a/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward_union1.mlu b/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward_union1.mlu index cf0ce7b07..6518640d2 100644 --- a/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward_union1.mlu +++ b/kernels/masked_im2col/masked_col2im_forward/masked_col2im_forward_union1.mlu @@ -29,7 +29,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char data_nram[MAX_NRAM_SIZE]; +__nram__ int8_t data_nram[MAX_NRAM_SIZE]; template __mlu_func__ void MLUMultiKernelMaskedCol2imForward( diff --git a/kernels/masked_im2col/masked_im2col_forward/masked_im2col_forward.cpp b/kernels/masked_im2col/masked_im2col_forward/masked_im2col_forward.cpp index 
2167b4860..d1bd663a7 100644 --- a/kernels/masked_im2col/masked_im2col_forward/masked_im2col_forward.cpp +++ b/kernels/masked_im2col/masked_im2col_forward/masked_im2col_forward.cpp @@ -40,7 +40,7 @@ static void policyFunc(const mluOpHandle_t handle, const int mask_cnt, k_dim->y = (task_dim / core_limit) > cluster_limit ? cluster_limit : (task_dim / core_limit); k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } static mluOpStatus_t maskedIm2colForwardPreCheck( @@ -283,9 +283,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpMaskedIm2colForward( mluOpDataType_t input_dtype = feature_desc->dtype; void *feature_workspace = workspace; void *data_col_workspace = - (char *)workspace + feature_desc->total_tensor_size; + (int8_t *)workspace + feature_desc->total_tensor_size; void *transpose_workspace = - (char *)data_col_workspace + data_col_desc->total_tensor_size; + (int8_t *)data_col_workspace + data_col_desc->total_tensor_size; cnrtDim3_t k_dim; cnrtFunctionType_t k_type; diff --git a/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp b/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp index 90c358431..67d4328b0 100644 --- a/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp +++ b/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp @@ -36,7 +36,7 @@ static void PolicyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { // union1 policy func - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; // dimx equals to num of MLU Cores in each cluster k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); // dimy equals to num of current available clusters diff --git a/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu b/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu index 4d2e9d08f..092450a60 100644 --- 
a/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu +++ b/kernels/moe_dispatch/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu @@ -27,9 +27,8 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; -#if __BANG_ARCH__ >= 372 template static __mlu_func__ void load(T *dispatch_addr, T *nram_dispatch, const int deal_num, const int pingpong_num, @@ -126,7 +125,6 @@ static __mlu_func__ void lcs(T *base_gard_input_addr, T *base_dispatch_addr, store(gard_input_addr, nram_gard_input, rem_num, pingpong_num, repeat_num); } } -#endif template __mlu_entry__ void MLUKernelMoeDispatchBwdData1( @@ -138,7 +136,6 @@ __mlu_entry__ void MLUKernelMoeDispatchBwdData1( // locations: (samples) // dispatch: (num_experts * capacity, hidden) // grad_input: (samples, hidden) -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -189,7 +186,6 @@ __mlu_entry__ void MLUKernelMoeDispatchBwdData1( int rem_h = hidden_seg_num % deal_h; lcs(base_grad_input_addr, base_dispatch_addr, nram_grad_input, nram_dispatch, gates_si_value, repeat_h, rem_h, deal_h, pingpong_num); -#endif } template @@ -202,7 +198,6 @@ __mlu_entry__ void MLUKernelMoeDispatchBwdData2( // locations: (samples) // dispatch: (num_experts * capacity, hidden) // grad_input: (samples, hidden) -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -307,7 +302,6 @@ __mlu_entry__ void MLUKernelMoeDispatchBwdData2( } // repeat h } // repeat deal_s_num } // repeat s -#endif } mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData1( diff --git a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp index 44a3b28dc..776c8b9e8 100644 --- a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp +++ 
b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp @@ -40,7 +40,7 @@ static void policyFunc(const mluOpHandle_t handle, const int samples, k_dim->y = 1; k_dim->z = 1; if (samples > max_core_num) { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } else { *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle); } diff --git a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu index 383c97d0a..e8e9ad0eb 100644 --- a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu +++ b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu @@ -27,9 +27,8 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; -#if __BANG_ARCH__ >= 372 template static __mlu_func__ void load(const T *input_addr, const T *dispatch_addr, T *nram_input, T *nram_dispatch, @@ -118,14 +117,12 @@ static __mlu_func__ void lcs(T *base_input_addr, T *base_dispatch_addr, __sync(); } } -#endif template __mlu_global__ void MLUKernelMoeDispatchBwdGate1( const int *indices, const int *locations, const T *input, const T *dispatch, const int samples, const int capacity, const int hidden, const int num_experts, T *workspace, T *grad_gates) { -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -208,7 +205,6 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate1( // store __memcpy(grad_gates, nram_grad_gates, samples * sizeof(T), NRAM2GDRAM); } -#endif } template @@ -216,7 +212,6 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate2( const int *indices, const int *locations, const T *input, const T *dispatch, const int samples, const int capacity, const int hidden, const int num_experts, T *grad_gates) { -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -354,7 +349,6 @@ 
__mlu_global__ void MLUKernelMoeDispatchBwdGate2( __memcpy(base_grad_gates + s_iter * deal_s, nram_grad_gates, deal_s_num * sizeof(T), NRAM2GDRAM); } -#endif } mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate1( diff --git a/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward.cpp b/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward.cpp index 2ce40dd27..b1e3384fd 100644 --- a/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward.cpp +++ b/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward.cpp @@ -34,7 +34,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { // block policy func - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; // dimx equals to num of mlu cores in each cluster k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); // dimy equals to num of current available clusters diff --git a/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward_block.mlu b/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward_block.mlu index 8e3cb502b..21ff3b928 100644 --- a/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward_block.mlu +++ b/kernels/moe_dispatch/moe_dispatch_forward/moe_dispatch_forward_block.mlu @@ -28,14 +28,13 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_global__ void MLUKernelMoeDispatchFwd( const T *gates, const int *indices, const int *locations, const T *input, const int samples, const int capacity, const int hidden, const int num_experts, T *dispatch) { -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -137,7 +136,6 @@ __mlu_global__ void MLUKernelMoeDispatchFwd( } } // deal s } // repeat s -#endif } mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchForward( diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward.cpp 
b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward.cpp index 2311a4bbb..778535274 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward.cpp +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward.cpp @@ -68,7 +68,7 @@ static void policyFunc(mluOpHandle_t handle, const int32_t batch, ? cluster_limit : (total_num_align / core_limit); k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } mluOpDeformAttnBackwardKernelPolicy_t msDeformAttnBackwardPolicyFunc( diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu index 03c5be4cd..21ee0b40d 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu @@ -35,9 +35,9 @@ #define WRAM_AVALIABLE_SIZE (__MLU_WRAM_SIZE__ * 1024) #define SRAM_AVALIABLE_SIZE (__MLU_SRAM_SIZE__ * 1024 - SRAM_REMAIN_SIZE) -__nram__ char nram_buffer[NRAM_AVALIABLE_SIZE]; -__mlu_shared__ char sram_buffer[SRAM_AVALIABLE_SIZE]; -__wram__ char wram_buffer[WRAM_AVALIABLE_SIZE]; +__nram__ int8_t nram_buffer[NRAM_AVALIABLE_SIZE]; +__mlu_shared__ int8_t sram_buffer[SRAM_AVALIABLE_SIZE]; +__wram__ int8_t wram_buffer[WRAM_AVALIABLE_SIZE]; __mlu_func__ void loadNram2Gpr(int32_t& v1, int32_t& v2, int32_t& v3, int32_t& v4, const int32_t* p1, @@ -68,7 +68,7 @@ __mlu_func__ void memPolicyBackward( T*& grad_wp_nram_stg3, // (4, total_deal_n, num_levels, num_points) int32_t*& data_offset_sram, T*& weight_polation_sram, T*& grad_wp_sram, T*& weight_attn_sram, T*& cond_point_polation_sram, T*& delta_xy_sram, - char* nram_buffer, char* sram_buffer, int32_t& max_cached_n, + int8_t* nram_buffer, int8_t* sram_buffer, int32_t& max_cached_n, int32_t& stage_1_max_deal_n, int32_t& stage_2_max_deal_n, int32_t& 
stage_3_max_deal_n, int32_t& mask_size, const int32_t nram_avaliable_size, const int32_t sram_avaliable_size, @@ -163,7 +163,7 @@ __mlu_func__ void backwardStageTwoLoop( T* weight_polation_sram, T* grad_wp_sram, T* weight_attn_sram, T* cond_point_polation_sram, T* delta_xy_sram, T* data_value_gdram, T* grad_output_gdram, T* grad_value_gdram, T* grad_attn_weight_gdram, - char* wram_buffer, const int32_t total_deal_n, const int32_t max_deal_n, + int8_t* wram_buffer, const int32_t total_deal_n, const int32_t max_deal_n, const int32_t input_stride_2, const int32_t input_stride_3, const int32_t output_stride_2, const int32_t num_heads, const int32_t channels, const int32_t num_levels, @@ -203,18 +203,19 @@ __mlu_func__ void backwardStageTwoLoop( if (nq_nl_np_pad8 == nq_nl_np) { int32_t bit_cond_stride_4 = 4 * bit_cond_stride; __bang_gt_bitindex((T*)bit_cond_nram, cond_nram, tmp_zero, nq_nl_np_4); - __bang_bnot((char*)bit_cond_reverse_nram, (char*)bit_cond_nram, + __bang_bnot((int8_t*)bit_cond_reverse_nram, (int8_t*)bit_cond_nram, 4 * bit_cond_stride); __bang_gt_bitindex((T*)(bit_cond_nram + bit_cond_stride_4), cond_nram + nq_nl_np_4, tmp_zero, nq_nl_np); - __bang_bnot((char*)(bit_cond_reverse_nram + bit_cond_stride_4), - (char*)(bit_cond_nram + bit_cond_stride_4), bit_cond_stride); + __bang_bnot((int8_t*)(bit_cond_reverse_nram + bit_cond_stride_4), + (int8_t*)(bit_cond_nram + bit_cond_stride_4), + bit_cond_stride); } else { for (int j = 0; j < 5; j++) { __bang_gt_bitindex((T*)((int8_t*)bit_cond_nram + j * bit_cond_stride), cond_nram + j * nq_nl_np, tmp_zero, nq_nl_np_pad8); - __bang_bnot((char*)bit_cond_reverse_nram + j * bit_cond_stride, - (char*)bit_cond_nram + j * bit_cond_stride, + __bang_bnot((int8_t*)bit_cond_reverse_nram + j * bit_cond_stride, + (int8_t*)bit_cond_nram + j * bit_cond_stride, bit_cond_stride); } } @@ -281,7 +282,7 @@ __mlu_func__ void backwardStageTwoLoop( nq_nl_np, 0); __bang_mul_scalar((int32_t*)v_pong, (int32_t*)v_pong, (int32_t)0xffffffff, 
nq_nl_np); - __bang_band((char*)buffer, (char*)buffer, (char*)v_pong, + __bang_band((int8_t*)buffer, (int8_t*)buffer, (int8_t*)v_pong, nq_nl_np * sizeof(T)); // (nq, nl, np) => (Nq, nl, np) __sync_compute(); diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu index 1237cfc53..517c00a8c 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu @@ -26,7 +26,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define ALIGN_NUM 32 @@ -46,7 +46,6 @@ void __mlu_func__ computeGridMaskAndOffset( const int32_t &num_points, const int32_t &w_stride, const int32_t &qid_stride, float *grad_temp1) { // [num_levels, 2] --> [2, num_levels] -#if __BANG_ARCH__ >= 372 __bang_transpose(nram_grad_output_tl, nram_loc_w, num_deal_grid, 2); // 2 * xhlp __bang_transpose(nram_loc_w, nram_grad_output_tl, @@ -275,7 +274,6 @@ void __mlu_func__ computeGridMaskAndOffset( __bang_mul(nram_w2, nram_hh, nram_lw, num_deal_grid); __bang_mul(nram_w3, nram_lh, nram_hw, num_deal_grid); __bang_mul(nram_w4, nram_lh, nram_lw, num_deal_grid); -#endif } void __mlu_func__ loadValue( @@ -290,7 +288,6 @@ void __mlu_func__ loadValue( const int32_t &num_query, const int32_t &num_levels, const int32_t &num_points, const int32_t &grid_offset, const int32_t &spatial_size, const int32_t &qid_stride) { -#if __BANG_ARCH__ >= 372 int32_t value_offset_temp = 0; #if __BANG_ARCH__ >= 592 @@ -378,24 +375,24 @@ void __mlu_func__ loadValue( __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; __bang_float2int32((int32_t *)grad_temp3, grad_temp3, num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t 
*)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); + __bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table, + num_deal_grid * deal_num_real, 64); __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); __bang_cycle_add(grad_temp1, grad_temp1, mask1, deal_num_real * num_deal_grid, num_deal_grid); __sync_io_move_compute(); - __bang_band((char *)nram_grad_output_tl, (char *)nram_grad_output_tl, - (char *)grad_temp3, + __bang_band((int8_t *)nram_grad_output_tl, (int8_t *)nram_grad_output_tl, + (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); __bang_float2int32((int32_t *)grad_temp3, grad_temp3, num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_tr, (char *)nram_grad_output_tr, - (char *)grad_temp3, + __bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table, + num_deal_grid * deal_num_real, 64); + __bang_band((int8_t *)nram_grad_output_tr, (int8_t *)nram_grad_output_tr, + (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); @@ -405,10 +402,10 @@ void __mlu_func__ loadValue( __bang_float2int32((int32_t *)grad_temp3, grad_temp3, num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_bl, (char *)nram_grad_output_bl, - (char *)grad_temp3, + __bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table, + num_deal_grid * deal_num_real, 64); + __bang_band((int8_t *)nram_grad_output_bl, (int8_t *)nram_grad_output_bl, + (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); __bang_write_zero(grad_temp1, deal_num_real * 
num_deal_grid); @@ -418,12 +415,11 @@ void __mlu_func__ loadValue( __bang_float2int32((int32_t *)grad_temp3, grad_temp3, num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_br, (char *)nram_grad_output_br, - (char *)grad_temp3, + __bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table, + num_deal_grid * deal_num_real, 64); + __bang_band((int8_t *)nram_grad_output_br, (int8_t *)nram_grad_output_br, + (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); -#endif } void __mlu_func__ computeGradValue( @@ -440,7 +436,6 @@ void __mlu_func__ computeGradValue( float *nram_grid_offset2, const int32_t &batch, float *nram_grad_output_tl, float *nram_grad_output_tr, float *nram_grad_output_bl, float *nram_grad_output_br, float *nram_grad_weight) { -#if __BANG_ARCH__ >= 372 __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, deal_num_real * num_deal_grid, num_deal_grid); @@ -590,7 +585,6 @@ void __mlu_func__ computeGradValue( } } #endif -#endif } void __mlu_func__ computeGradAttnWeight( @@ -606,7 +600,6 @@ void __mlu_func__ computeGradAttnWeight( const int32_t &grid_offset, float *nram_h_high_temp) { __bang_write_zero(grad_w_weight, 2 * offset_nram); // grad_output_nram_tl -#if __BANG_ARCH__ >= 372 __bang_transpose(grad_weight, nram_grad_output_tl, num_deal_grid, deal_num_real); __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_hw, @@ -690,13 +683,12 @@ void __mlu_func__ computeGradAttnWeight( __bang_float2int32((int32_t *)nram_h_high_temp, nram_h_high_temp, num_deal_grid, 0); __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)nram_grad_output_tr, (char *)nram_grad_output_tr, - (char 
*)nram_h_high_temp, num_deal_grid * sizeof(float)); + __bang_lut((int32_t *)nram_h_high_temp, (uint32_t *)nram_h_high_temp, + (int32_t *)table, num_deal_grid, 64); + __bang_band((int8_t *)nram_grad_output_tr, (int8_t *)nram_grad_output_tr, + (int8_t *)nram_h_high_temp, num_deal_grid * sizeof(float)); __bang_atomic_reduce_add((float *)grad_attn_weight + grid_offset, (float *)nram_grad_output_tr, num_deal_grid); -#endif } void __mlu_func__ computeGradSampingLoc( @@ -708,7 +700,6 @@ void __mlu_func__ computeGradSampingLoc( const int32_t &num_heads, const int32_t &num_levels, const int32_t &num_points, const int32_t &grid_offset, float *nram_h_high_temp) { -#if __BANG_ARCH__ >= 372 __bang_add_scalar((float *)nram_spatial_shapes, (float *)nram_spatial_shapes, 1.0, 2 * num_levels); __bang_transpose(nram_grad_output_tl, grad_h_weight, @@ -746,10 +737,10 @@ void __mlu_func__ computeGradSampingLoc( ALIGN_NUM); __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)grad_h_weight, (char *)grad_h_weight, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); + __bang_lut((int32_t *)nram_h_high_temp, (uint32_t *)nram_h_high_temp, + (int32_t *)table, num_deal_grid, 64); + __bang_band((int8_t *)grad_h_weight, (int8_t *)grad_h_weight, + (int8_t *)nram_h_high_temp, num_deal_grid * sizeof(float)); __bang_transpose(nram_grad_output_tl, grad_w_weight, num_per_time_real * num_heads * num_levels * deal_num_real, @@ -769,17 +760,16 @@ void __mlu_func__ computeGradSampingLoc( num_deal_grid * deal_num_real * sizeof(float), NRAM2NRAM); __mluop_recursive_sum_pool(grad_w_weight, num_deal_grid, deal_num_real, ALIGN_NUM); - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)grad_w_weight, (char *)grad_w_weight, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); + 
__bang_lut((int32_t *)nram_h_high_temp, (uint32_t *)nram_h_high_temp, + (int32_t *)table, num_deal_grid, 64); + __bang_band((int8_t *)grad_w_weight, (int8_t *)grad_w_weight, + (int8_t *)nram_h_high_temp, num_deal_grid * sizeof(float)); __memcpy_async(grad_w_weight + num_deal_grid, grad_h_weight, num_deal_grid * sizeof(float), NRAM2NRAM); __bang_transpose(nram_grad_output_tl, grad_w_weight, 2, num_deal_grid); __bang_atomic_reduce_add((float *)grad_sampling_loc + grid_offset * 2, (float *)nram_grad_output_tl, 2 * num_deal_grid); -#endif } __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardSmallChannelsKernel( diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu index d70448a51..66505f6c0 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu @@ -26,7 +26,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define likely(x) __builtin_expect((x), 1) #define ALIGN_NUM 64 @@ -46,7 +46,6 @@ void __mlu_func__ msDeformAttnCol2imBilinear( T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp, const int32_t &deal_num, const int32_t &deal_num_real, const T *data_value_ptr) { -#if __BANG_ARCH__ >= 372 if (h_low >= 0 && w_low >= 0) { int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; __memcpy(grad_output_nram, data_value_ptr + offset1, @@ -137,7 +136,6 @@ void __mlu_func__ msDeformAttnCol2imBilinear( __mluop_recursive_sum_pool(grad_h_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE); __bang_atomic_reduce_add((T *)(grad_sampling_loc + 1), (T *)grad_h_weight, 1); -#endif } __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( diff --git 
a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.h b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.h index 942601345..6b76faab5 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.h +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.h @@ -31,29 +31,29 @@ template __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, + const int8_t *data_value_gdram, const int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int batch_size, const int num_keys, const int num_heads, const int channels, const int num_levels, const int num_queries, - const int num_points, char *data_col_gdram); + const int num_points, int8_t *data_col_gdram); template __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, + const int8_t *data_value_gdram, const int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int batch_size, const int num_keys, const int num_heads, const int channels, const int num_levels, const int num_queries, - const int num_points, char *data_col_gdram); + const int num_points, int8_t *data_col_gdram); template __mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, + const int8_t 
*data_value_gdram, const int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); + const int32_t num_points, int8_t *data_col_gdram); #endif // KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_ diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu index 7b803d4e2..2f4906d97 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu @@ -56,7 +56,7 @@ MsDeformAttnForwardPolicy msDeformAttnForwardPolicyFunc( mluop::runtime::getClusterLimitCapability(handle)); k_dims->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; int32_t nlp = num_levels * num_points; int32_t nlpc = num_levels * num_points * channels; @@ -274,32 +274,32 @@ mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnForward( VLOG(5) << "Not Implemented"; break; } - case CNRT_FUNC_TYPE_BLOCK: { + case cnrtFuncTypeBlock: { VLOG(5) << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<>>"; KERNEL_CHECK( (MLUKernelMsDeformAttnForwardDefault <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); + (int8_t *)data_value, (int8_t *)data_spatial_shapes, + (int8_t *)data_level_start_index, + (int8_t *)data_sampling_loc, (int8_t *)data_attn_weight, + batch_size, num_keys, num_heads, channels, num_levels, + num_queries, num_points, (int8_t *)data_col))); break; } - case 
CNRT_FUNC_TYPE_UNION1: { + case cnrtFuncTypeUnion1: { VLOG(5) << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<>>"; KERNEL_CHECK( (MLUKernelMsDeformAttnForwardDefault <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); + (int8_t *)data_value, (int8_t *)data_spatial_shapes, + (int8_t *)data_level_start_index, + (int8_t *)data_sampling_loc, (int8_t *)data_attn_weight, + batch_size, num_keys, num_heads, channels, num_levels, + num_queries, num_points, (int8_t *)data_col))); break; } } @@ -311,21 +311,21 @@ mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnForward( VLOG(5) << "Not Implemented"; break; } - case CNRT_FUNC_TYPE_BLOCK: { + case cnrtFuncTypeBlock: { VLOG(5) << "Launch Kernel " "MLUKernelMsDeformAttnForwardSmallChannel<<>>"; KERNEL_CHECK( (MLUKernelMsDeformAttnForwardSmallChannel <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); + (int8_t *)data_value, (int8_t *)data_spatial_shapes, + (int8_t *)data_level_start_index, + (int8_t *)data_sampling_loc, (int8_t *)data_attn_weight, + batch_size, num_keys, num_heads, channels, num_levels, + num_queries, num_points, (int8_t *)data_col))); break; } - case CNRT_FUNC_TYPE_UNION1: { + case cnrtFuncTypeUnion1: { VLOG(5) << "Launch Kernel " "MLUKernelMsDeformAttnForwardSmallChannel<< <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, (char *)data_sampling_loc, - (char *)data_attn_weight, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points, - (char *)data_col))); + (int8_t *)data_value, (int8_t *)data_spatial_shapes, + (int8_t 
*)data_level_start_index, + (int8_t *)data_sampling_loc, (int8_t *)data_attn_weight, + batch_size, num_keys, num_heads, channels, num_levels, + num_queries, num_points, (int8_t *)data_col))); break; } } @@ -349,11 +349,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnForward( << ", " << k_dims.z << ">>>"; KERNEL_CHECK((MLUKernelMsDeformAttnForwardFast <<queue>>>( - (char *)data_value, (char *)data_spatial_shapes, - (char *)data_level_start_index, - (char *)data_sampling_loc, (char *)data_attn_weight, + (int8_t *)data_value, (int8_t *)data_spatial_shapes, + (int8_t *)data_level_start_index, + (int8_t *)data_sampling_loc, (int8_t *)data_attn_weight, batch_size, num_keys, num_heads, channels, num_levels, - num_queries, num_points, (char *)data_col))); + num_queries, num_points, (int8_t *)data_col))); break; } } diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h index da907e369..122d2d35e 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h @@ -32,8 +32,6 @@ #define BIT_COLLECT_PAD (8) #define BACKWARD_MAX_NQ_NL_NP (1024) -#if (__BANG_ARCH__ >= 372) - __mlu_func__ void broadcastSpatialHW( float* spatial_offset_bd_nram, // (num_levels, num_points) float* spatial_h_bd_nram, // (num_levels, num_points) @@ -80,8 +78,8 @@ __mlu_func__ void prepareLoopV2( __bang_add_scalar(seq_nram + 512, seq_nram, 512, 512); // [0, 511] + 512 } __bang_write_value(zeros_nram, channels, (T)0); - __bang_write_value(mask_x_nram, mask_size, (char)0x55); - __bang_write_value(mask_y_nram, mask_size, (char)0xAA); + __bang_write_value(mask_x_nram, mask_size, (int8_t)0x55); + __bang_write_value(mask_y_nram, mask_size, (int8_t)0xAA); __memcpy_async(spatial_offset_nram, data_level_start_index_gdram, num_levels * sizeof(int32_t), GDRAM2NRAM); __memcpy_async(spatial_hw_nram, 
data_spatial_shapes_gdram, @@ -304,8 +302,8 @@ __mlu_func__ void computePolationWeightOffsetCond( __bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp, (int32_t*)cond_point_polation_nram_tmp, (int32_t)0xffffffff, total_points * 4); - __bang_band((char*)weight_polation_nram, (char*)weight_polation_nram, - (char*)cond_point_polation_nram_tmp, + __bang_band((int8_t*)weight_polation_nram, (int8_t*)weight_polation_nram, + (int8_t*)cond_point_polation_nram_tmp, total_points * 4 * sizeof(float)); } @@ -368,7 +366,8 @@ __mlu_func__ void stageOneLoop( __bang_mul_scalar((int32_t*)cond_point_valid_nram, (int32_t*)cond_point_valid_nram, (int32_t)0xffffffff, deal_point_num); - __bang_band((char*)buf_nram, (char*)buf_nram, (char*)cond_point_valid_nram, + __bang_band((int8_t*)buf_nram, (int8_t*)buf_nram, + (int8_t*)cond_point_valid_nram, deal_n * num_levels * num_points * sizeof(T)); __memcpy(weight_attn_sram + sram_offset, buf_nram, copy_size, NRAM2SRAM); sram_offset += deal_point_num; @@ -394,5 +393,3 @@ __mlu_func__ void gatherSync(void* dst, void* src, unsigned int* offset, transfer_num); } #endif - -#endif diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu index c5f2ddcc7..b21af0a0e 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu @@ -24,8 +24,6 @@ #pragma bang walign(64) -#if (__BANG_ARCH__ >= 372) - #define MAX_MEMCPY_SEGNUM (65536) #define NRAM_REMAIN_SIZE (48 * 1024) #define SRAM_REMAIN_SIZE (32 * 1024) @@ -46,9 +44,9 @@ #define WRAM_ALIGN_SIZE (64) #endif -__nram__ char nram_buffer[NRAM_AVALIABLE_SIZE]; -__mlu_shared__ char sram_buffer[SRAM_AVALIABLE_SIZE]; -__wram__ char wram_buffer[WRAM_AVALIABLE_SIZE]; +__nram__ int8_t nram_buffer[NRAM_AVALIABLE_SIZE]; +__mlu_shared__ int8_t sram_buffer[SRAM_AVALIABLE_SIZE]; +__wram__ int8_t 
wram_buffer[WRAM_AVALIABLE_SIZE]; template __mlu_func__ void tileWeight2WramAsync(T* dst, @@ -298,8 +296,8 @@ __mlu_func__ void getConditionCoordWeight( __bang_mul_scalar((int32_t*)cond_point_polation_nram, (int32_t*)cond_point_polation_nram, (int32_t)0xffffffff, total_points * 4); - __bang_band((char*)weight_polation_nram_tmp, (char*)weight_polation_nram, - (char*)cond_point_polation_nram, + __bang_band((int8_t*)weight_polation_nram_tmp, (int8_t*)weight_polation_nram, + (int8_t*)cond_point_polation_nram, total_points * 4 * sizeof(float)); __bang_filter((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, cond_point_valid_nram, total_points); @@ -404,8 +402,8 @@ __mlu_func__ void reduceLevelByConv( __sync_move(); if (value_contain_infnan) { - __bang_cycle_band((char*)input_trans, (char*)input_trans, - (char*)cond_compute, co * ci * sizeof(T), + __bang_cycle_band((int8_t*)input_trans, (int8_t*)input_trans, + (int8_t*)cond_compute, co * ci * sizeof(T), ci * sizeof(T)); } @@ -593,16 +591,16 @@ __mlu_func__ void prepareLoop( T* ones_nram, int32_t* spatial_offset_nram, int32_t* spatial_hw_nram, int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram, T* spatial_h_bd_nram, T* spatial_w_bd_nram, T* value_sram, - const char* data_level_start_index_gdram, - const char* data_spatial_shapes_gdram, const int32_t num_keys, + const int8_t* data_level_start_index_gdram, + const int8_t* data_spatial_shapes_gdram, const int32_t num_keys, const int32_t num_levels, const int32_t num_points, const int32_t max_deal_n, const int32_t mask_size, const int32_t channels) { int32_t pad_num_points_levels = PAD_UP(num_levels * num_points, WRAM_ALIGN_SIZE / sizeof(T)); __bang_write_value(ones_nram, pad_num_points_levels, (T)0); __bang_write_value(ones_nram, num_levels * num_points, (T)1); - __bang_write_value(mask_x_nram, mask_size, (char)0x55); - __bang_write_value(mask_y_nram, mask_size, (char)0xAA); + __bang_write_value(mask_x_nram, mask_size, (int8_t)0x55); + 
__bang_write_value(mask_y_nram, mask_size, (int8_t)0xAA); __memcpy_async(spatial_offset_nram, data_level_start_index_gdram, num_levels * sizeof(int32_t), GDRAM2NRAM); __memcpy_async(spatial_hw_nram, data_spatial_shapes_gdram, @@ -730,12 +728,12 @@ __mlu_func__ void memPolicyCommon( template __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, + const int8_t* data_value_gdram, const int8_t* data_spatial_shapes_gdram, + const int8_t* data_level_start_index_gdram, + const int8_t* data_sampling_loc_gdram, const int8_t* data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { + const int32_t num_points, int8_t* data_col_gdram) { int32_t input_stride_4 = num_queries * num_heads * num_levels * num_points; int32_t input_stride_3 = num_heads * num_levels * num_points; int32_t input_stride_2 = num_levels * num_points; @@ -827,7 +825,7 @@ __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( if (__is_ipu()) { if (sram_stay) { int32_t buf_size = - (int)((char*)buf_nram_end - (char*)value_output_nram); + (int)((int8_t*)buf_nram_end - (int8_t*)value_output_nram); isValueContainInfNan(value_sram, value_sram + num_keys * channels, value_output_nram, value_contain_infnan, buf_size, num_keys * channels); @@ -907,7 +905,6 @@ __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( __sync_cluster(); } } -#endif #if (__BANG_ARCH__ == 592) @@ -943,8 +940,8 @@ __mlu_func__ void memPolicy590( T*& value_pong, T*& compute_buffer, T*& weight_polation_nram_stg2, T*& weight_attn_nram_stg2, int32_t*& offset_nram_stg2, T*& output_nram, T*& cond_nram_stg2, int32_t*& data_offset_sram, T*& weight_polation_sram, - T*& weight_attn_sram, T*& 
cond_point_polation_sram, char* nram_buffer, - char* sram_buffer, int32_t& max_cached_n, int32_t& stage_1_max_deal_n, + T*& weight_attn_sram, T*& cond_point_polation_sram, int8_t* nram_buffer, + int8_t* sram_buffer, int32_t& max_cached_n, int32_t& stage_1_max_deal_n, int32_t& stage_2_max_deal_n, int32_t& mask_size, const int32_t nram_avaliable_size, const int32_t sram_avaliable_size, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, @@ -1064,7 +1061,7 @@ __mlu_func__ void forwardStageTwoLoop( __sync_move(); __bang_gt_bitindex(cond_nram_stg2, cond_nram_stg2, compute_buffer_nram, total_point_pad_8); - __bang_bnot((char*)cond_nram_stg2_reverse, (char*)cond_nram_stg2, + __bang_bnot((int8_t*)cond_nram_stg2_reverse, (int8_t*)cond_nram_stg2, gather_mask_size); } @@ -1102,12 +1099,12 @@ __mlu_func__ void forwardStageTwoLoop( // only for 590 template __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, + const int8_t* data_value_gdram, const int8_t* data_spatial_shapes_gdram, + const int8_t* data_level_start_index_gdram, + const int8_t* data_sampling_loc_gdram, const int8_t* data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { + const int32_t num_points, int8_t* data_col_gdram) { int32_t input_stride_4 = num_queries * num_heads * num_levels * num_points; int32_t input_stride_3 = num_heads * num_levels * num_points; int32_t input_stride_2 = num_levels * num_points; @@ -1240,12 +1237,12 @@ __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( template __mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* 
data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, + const int8_t* data_value_gdram, const int8_t* data_spatial_shapes_gdram, + const int8_t* data_level_start_index_gdram, + const int8_t* data_sampling_loc_gdram, const int8_t* data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { + const int32_t num_points, int8_t* data_col_gdram) { #if (__BANG_ARCH__ == 372) size_t single_value_size = num_keys * channels * sizeof(T); if (single_value_size <= SRAM_FOR_VALUE_SIZE) { @@ -1272,9 +1269,9 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardFast( } template __mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, + const int8_t* data_value_gdram, const int8_t* data_spatial_shapes_gdram, + const int8_t* data_level_start_index_gdram, + const int8_t* data_sampling_loc_gdram, const int8_t* data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram); + const int32_t num_points, int8_t* data_col_gdram); diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu index 7028a9c98..e1abf634d 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu @@ -26,10 +26,9 @@ #define ELE_COUNT 32 /* cycle element count */ -__nram__ char nram_buffer[MAX_NRAM_SIZE]; 
+__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_func__ void genMask0101(float *mask_ram, int32_t size) { -#if __BANG_ARCH__ >= 372 int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); for (int32_t i = 0; i < align_num; ++i) { mask_ram[i] = i % 2; @@ -40,18 +39,16 @@ __mlu_func__ void genMask0101(float *mask_ram, int32_t size) { __memcpy(mask_ram + align_num, mask_ram, NFU_ALIGN_SIZE, NRAM2NRAM, NFU_ALIGN_SIZE, 0, (size / align_num + (size % align_num > 0)) - 2); __sync(); -#endif } template __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, + const int8_t *data_value_gdram, const int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { -#if __BANG_ARCH__ >= 372 + const int32_t num_points, int8_t *data_col_gdram) { if (__is_mpu()) { return; } @@ -109,44 +106,44 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( const int32_t g_rep = deal_g / deal_num; const int32_t g_rem = deal_g % deal_num; // nram buffer alloc - char *data_spatial_shapes_nram = nram_buffer; - char *data_level_start_index_nram = data_spatial_shapes_nram + spatial_size; - char *input_tl = data_level_start_index_nram + level_start_index_size; - char *input_tr = input_tl + deal_num * mult * sizeof(T); - char *input_bl = input_tr + deal_num * mult * sizeof(T); - char *input_br = input_bl + deal_num * mult * sizeof(T); - char *weight_tl = input_tl + 4 * deal_num * mult * sizeof(T); - char *weight_tr = weight_tl + deal_num * mult * sizeof(T); - char *weight_bl = weight_tr + deal_num * mult 
* sizeof(T); - char *weight_br = weight_bl + deal_num * mult * sizeof(T); - char *mask_tl = weight_br + deal_num * mult * sizeof(T); - char *mask_tr = mask_tl + deal_num * sizeof(T); - char *mask_bl = mask_tr + deal_num * sizeof(T); - char *mask_br = mask_bl + deal_num * sizeof(T); - char *point_ram = mask_br + deal_num * sizeof(T); - char *index_tl = point_ram + deal_num * sizeof(T); - char *index_bl = index_tl + deal_num * sizeof(T); - char *valid_mask = index_bl + deal_num * sizeof(T); + int8_t *data_spatial_shapes_nram = nram_buffer; + int8_t *data_level_start_index_nram = data_spatial_shapes_nram + spatial_size; + int8_t *input_tl = data_level_start_index_nram + level_start_index_size; + int8_t *input_tr = input_tl + deal_num * mult * sizeof(T); + int8_t *input_bl = input_tr + deal_num * mult * sizeof(T); + int8_t *input_br = input_bl + deal_num * mult * sizeof(T); + int8_t *weight_tl = input_tl + 4 * deal_num * mult * sizeof(T); + int8_t *weight_tr = weight_tl + deal_num * mult * sizeof(T); + int8_t *weight_bl = weight_tr + deal_num * mult * sizeof(T); + int8_t *weight_br = weight_bl + deal_num * mult * sizeof(T); + int8_t *mask_tl = weight_br + deal_num * mult * sizeof(T); + int8_t *mask_tr = mask_tl + deal_num * sizeof(T); + int8_t *mask_bl = mask_tr + deal_num * sizeof(T); + int8_t *mask_br = mask_bl + deal_num * sizeof(T); + int8_t *point_ram = mask_br + deal_num * sizeof(T); + int8_t *index_tl = point_ram + deal_num * sizeof(T); + int8_t *index_bl = index_tl + deal_num * sizeof(T); + int8_t *valid_mask = index_bl + deal_num * sizeof(T); // nram space reuse - char *grid_ram = weight_tl; - char *mask_ram = weight_bl; - char *coord_x = input_bl; - char *coord_y = coord_x + deal_num * sizeof(T); - char *coord_x_low = input_tl; - char *coord_y_low = coord_x_low + deal_num * sizeof(T); - char *coord_x_low_int = weight_tl; - char *coord_y_low_int = weight_tr; - char *spatial_x = mask_tl; - char *spatial_y = mask_tr; - char *spatial_x_float = weight_bl; - char 
*spatial_y_float = weight_br; - char *spatial_x_temp = mask_bl; - char *spatial_y_temp = mask_br; + int8_t *grid_ram = weight_tl; + int8_t *mask_ram = weight_bl; + int8_t *coord_x = input_bl; + int8_t *coord_y = coord_x + deal_num * sizeof(T); + int8_t *coord_x_low = input_tl; + int8_t *coord_y_low = coord_x_low + deal_num * sizeof(T); + int8_t *coord_x_low_int = weight_tl; + int8_t *coord_y_low_int = weight_tr; + int8_t *spatial_x = mask_tl; + int8_t *spatial_y = mask_tr; + int8_t *spatial_x_float = weight_bl; + int8_t *spatial_y_float = weight_br; + int8_t *spatial_x_temp = mask_bl; + int8_t *spatial_y_temp = mask_br; #if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - char *base_ptr_offset = weight_tl; + int8_t *base_ptr_offset = weight_tl; #endif - char *auxiliary_a = point_ram; - char *auxiliary_b = weight_bl; + int8_t *auxiliary_a = point_ram; + int8_t *auxiliary_b = weight_bl; __memcpy_async(data_spatial_shapes_nram, data_spatial_shapes_gdram, num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); __memcpy_async(data_level_start_index_nram, data_level_start_index_gdram, @@ -165,7 +162,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( io_data_num = g_rem; } } - char *data_col_gdram_start = + int8_t *data_col_gdram_start = data_col_gdram + (batch_idx * num_queries * num_heads * channels + (offset_g + grid_iter * deal_num) / (num_levels * num_points) * channels) * @@ -216,7 +213,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( // generate valid mask, which means the location is nan/inf or not // condition coordx > -1 / coordy > -1 __bang_gt_scalar((float *)auxiliary_a, (float *)coord_x, -1.0, deal_num); - __bang_move((char *)valid_mask, (char *)auxiliary_a, + __bang_move((int8_t *)valid_mask, (int8_t *)auxiliary_a, deal_num * sizeof(float)); __bang_gt_scalar((float *)auxiliary_a, (float *)coord_y, -1.0, deal_num); __bang_add((float *)valid_mask, (float *)valid_mask, (float *)auxiliary_a, @@ -385,8 +382,9 @@ __mlu_global__ void 
MLUKernelMsDeformAttnForwardSmallChannel( (float *)weight_tr, deal_num); // if loc has nan/inf, fill all invalid potision with 0. // Note that this operation handles in bit-scale. - __bang_cycle_band((char *)input_tl, (char *)input_tl, (char *)valid_mask, - 4 * deal_num * sizeof(float), deal_num * sizeof(float)); + __bang_cycle_band((int8_t *)input_tl, (int8_t *)input_tl, + (int8_t *)valid_mask, 4 * deal_num * sizeof(float), + deal_num * sizeof(float)); __sync(); // extend weight const int32_t w_rep = channel / ELE_COUNT * ELE_COUNT; @@ -406,7 +404,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( } } __sync(); - const char *data_value_gdram_start = + const int8_t *data_value_gdram_start = data_value_gdram + batch_idx * num_keys * num_heads * channels * sizeof(float); const int32_t c_str = deal_num * channel * sizeof(float); @@ -431,7 +429,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( ((int32_t *)data_level_start_index_nram)[(p_idx / num_points) % num_levels]; #if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - const char *data_value_ptr = + const int8_t *data_value_ptr = data_value_gdram_start + (level_start_id * num_heads * channels + c_iter * channel) * sizeof(float); @@ -439,7 +437,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( const int32_t head_idx = ((p_idx + offset_g + grid_iter * deal_num) / (num_levels * num_points)) % num_heads; - const char *data_value_ptr = + const int8_t *data_value_ptr = data_value_gdram_start + (level_start_id * num_heads * channels + head_idx * channels + c_iter * channel) * @@ -545,13 +543,12 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( } __sync(); return; -#endif } template __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, + const int8_t *data_value_gdram, const 
int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); + const int32_t num_points, int8_t *data_col_gdram); diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_union1_default.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_union1_default.mlu index 0b3a30f30..4c4d525ca 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_union1_default.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_union1_default.mlu @@ -27,7 +27,7 @@ #define TWELVE_SPLIT 12 #define ELE_COUNT 32 /* cycle element count */ -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void loadNeighborPointsData( @@ -137,12 +137,12 @@ __mlu_func__ void computeMsDeformAttn( template __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, + const int8_t *data_value_gdram, const int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { + const int32_t num_points, int8_t *data_col_gdram) { if (__is_mpu()) { return; } @@ -154,28 +154,28 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( const int32_t channels_seg_num = channels / span_num_deal; const size_t channels_rem = channels % 
span_num_deal; const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num); - char *data_spatial_shapes_nram = nram_buffer; - char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size; - char *ping_data_value_p2_nram = + int8_t *data_spatial_shapes_nram = nram_buffer; + int8_t *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size; + int8_t *ping_data_value_p2_nram = ping_data_value_p1_nram + span_num_deal * sizeof(T); - char *ping_data_value_p3_nram = + int8_t *ping_data_value_p3_nram = ping_data_value_p2_nram + span_num_deal * sizeof(T); - char *ping_data_value_p4_nram = + int8_t *ping_data_value_p4_nram = ping_data_value_p3_nram + span_num_deal * sizeof(T); - char *ping_data_col_nram = + int8_t *ping_data_col_nram = ping_data_value_p4_nram + span_num_deal * sizeof(T); - char *pong_data_value_p1_nram = + int8_t *pong_data_value_p1_nram = ping_data_col_nram + span_num_deal * sizeof(T); - char *pong_data_value_p2_nram = + int8_t *pong_data_value_p2_nram = pong_data_value_p1_nram + span_num_deal * sizeof(T); - char *pong_data_value_p3_nram = + int8_t *pong_data_value_p3_nram = pong_data_value_p2_nram + span_num_deal * sizeof(T); - char *pong_data_value_p4_nram = + int8_t *pong_data_value_p4_nram = pong_data_value_p3_nram + span_num_deal * sizeof(T); - char *pong_data_col_nram = + int8_t *pong_data_col_nram = pong_data_value_p4_nram + span_num_deal * sizeof(T); - char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T); - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); + int8_t *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T); + int8_t *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T); size_t data_col_ping_pong_idx = 0; int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim; @@ -196,15 +196,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( */ const int32_t head_idx = cur_idx % num_heads; const 
int32_t batch_idx = (cur_idx / num_heads) / num_queries; - const char *data_value_gdram_start = + const int8_t *data_value_gdram_start = data_value_gdram + batch_idx * num_keys * num_heads * channels * sizeof(T); - const char *data_sampling_loc_gdram_start = + const int8_t *data_sampling_loc_gdram_start = data_sampling_loc_gdram + cur_idx * num_levels * num_points * 2 * sizeof(T); - const char *data_attn_weight_gdram_start = + const int8_t *data_attn_weight_gdram_start = data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T); - char *data_col_gdram_start = + int8_t *data_col_gdram_start = data_col_gdram + cur_idx * channels * sizeof(T); for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) { __bang_write_value( @@ -216,7 +216,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( 2 * sizeof(int32_t), GDRAM2NRAM); int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = + const int8_t *data_value_ptr = data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T); T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; @@ -361,7 +361,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( 2 * sizeof(int32_t), GDRAM2NRAM); int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = + const int8_t *data_value_ptr = data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T); T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; @@ -502,9 +502,9 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( } template __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const 
char *data_attn_weight_gdram, + const int8_t *data_value_gdram, const int8_t *data_spatial_shapes_gdram, + const int8_t *data_level_start_index_gdram, + const int8_t *data_sampling_loc_gdram, const int8_t *data_attn_weight_gdram, const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); + const int32_t num_points, int8_t *data_col_gdram); diff --git a/kernels/mutual_information/mutual_information_backward/mutual_information_backward.cpp b/kernels/mutual_information/mutual_information_backward/mutual_information_backward.cpp index 6f190b8e3..68e1e145e 100644 --- a/kernels/mutual_information/mutual_information_backward/mutual_information_backward.cpp +++ b/kernels/mutual_information/mutual_information_backward/mutual_information_backward.cpp @@ -452,7 +452,7 @@ static void policyFunc3Pipeline(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, int batch_size) { int core_num = mluop::runtime::getClusterLimitCapability(handle) * mluop::runtime::getCoreNumOfEachUnionCapability(handle); - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = 1; k_dim->y = batch_size < core_num ? 
batch_size : core_num; k_dim->z = 1; @@ -776,7 +776,7 @@ static mluOpStatus_t launchMutualInformationBackwardDefaultKernel( int max_s_t_block_num = std::max(s_block_num, t_block_num); int min_s_t_block_num = std::min(s_block_num, t_block_num); - k_type = CNRT_FUNC_TYPE_BLOCK; + k_type = cnrtFuncTypeBlock; k_dim.y = 1; k_dim.z = 1; // Get current arch support max dim_x value diff --git a/kernels/mutual_information/mutual_information_backward/mutual_information_backward_utils.h b/kernels/mutual_information/mutual_information_backward/mutual_information_backward_utils.h index 19bbe306c..addc5e411 100644 --- a/kernels/mutual_information/mutual_information_backward/mutual_information_backward_utils.h +++ b/kernels/mutual_information/mutual_information_backward/mutual_information_backward_utils.h @@ -25,7 +25,7 @@ #include "mlu_op.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_func__ void setNanInfToZero(float *src, float *mask, const int num) { // band with 0x7F800000, exp bits are not all 1, mask -> 0xffffffff @@ -35,14 +35,16 @@ __mlu_func__ void setNanInfToZero(float *src, float *mask, const int num) { (int32_t *)mask), [ size ] "r"(num), [ src0 ] "r"((int32_t *)src), [ src1 ] "r"(0x7f800000), [ src2 ] "r"(0x7f800000), [ src3 ] "r"(-1)); - __bang_band((char *)src, (char *)src, (char *)mask, num * sizeof(float)); + __bang_band((int8_t *)src, (int8_t *)src, (int8_t *)mask, + num * sizeof(float)); } __mlu_func__ void safeExp(float *dst, float *src, float *mask, const int num) { setNanInfToZero(src, mask, num); __mluop_exp(dst, src, NULL, 0, num); // erase exp(0) to 0 with mask - __bang_band((char *)dst, (char *)dst, (char *)mask, num * sizeof(float)); + __bang_band((int8_t *)dst, (int8_t *)dst, (int8_t *)mask, + num * sizeof(float)); setNanInfToZero(dst, mask, num); } diff --git a/kernels/mutual_information/mutual_information_forward/mutual_information_forward.cpp 
b/kernels/mutual_information/mutual_information_forward/mutual_information_forward.cpp index 3373b9f79..b655237df 100644 --- a/kernels/mutual_information/mutual_information_forward/mutual_information_forward.cpp +++ b/kernels/mutual_information/mutual_information_forward/mutual_information_forward.cpp @@ -365,7 +365,7 @@ static void policyFunc3Pipeline(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, int batch_size) { int core_num = mluop::runtime::getClusterLimitCapability(handle) * mluop::runtime::getCoreNumOfEachUnionCapability(handle); - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = 1; k_dim->y = batch_size < core_num ? batch_size : core_num; k_dim->z = 1; @@ -662,7 +662,7 @@ static mluOpStatus_t launchMutualInformationForwardDefaultKernel( int max_s_t_block_num = std::max(s_block_num, t_block_num); int min_s_t_block_num = std::min(s_block_num, t_block_num); - k_type = CNRT_FUNC_TYPE_BLOCK; + k_type = cnrtFuncTypeBlock; k_dim.y = 1; k_dim.z = 1; // Get current arch support max dim_x value diff --git a/kernels/mutual_information/mutual_information_forward/mutual_information_forward_utils.h b/kernels/mutual_information/mutual_information_forward/mutual_information_forward_utils.h index fc743a95b..78cef2560 100644 --- a/kernels/mutual_information/mutual_information_forward/mutual_information_forward_utils.h +++ b/kernels/mutual_information/mutual_information_forward/mutual_information_forward_utils.h @@ -27,7 +27,7 @@ #define MIN_LOG_DIFF_FLOAT -15.9423847198486328125f -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_func__ void logAddVector(float *dst, float *src1, float *src2, float *max_value, float *mask, float *temp, @@ -61,13 +61,14 @@ __mlu_func__ void logAddVector(float *dst, float *src1, float *src2, ".eq(%[src1]), .mul(%[src2]);\n" ::[dst] "r"((int32_t *)mask), [ size ] "r"(data_num), [ src0 ] "r"((int32_t *)mask), [ src1 ] "r"(0x3f800000), [ src2 ] 
"r"(-1)); - __bang_band((char *)dst, (char *)dst, (char *)mask, data_num * sizeof(float)); + __bang_band((int8_t *)dst, (int8_t *)dst, (int8_t *)mask, + data_num * sizeof(float)); // Reverse the mask bits, ((int)mask+1)*(-1), 0->-1, -1->0 __bang_fusion(FUSION_FAM, (int *)mask, (int *)mask, 1, -1, data_num); - __bang_band((char *)max_value, (char *)max_value, (char *)mask, + __bang_band((int8_t *)max_value, (int8_t *)max_value, (int8_t *)mask, data_num * sizeof(float)); __bang_add(dst, dst, max_value, data_num); } -#endif // KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ // NOLINT +#endif // KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ // NOLINT diff --git a/kernels/nms_rotated/nms_rotated.cpp b/kernels/nms_rotated/nms_rotated.cpp index 2c7965917..7575e5a18 100644 --- a/kernels/nms_rotated/nms_rotated.cpp +++ b/kernels/nms_rotated/nms_rotated.cpp @@ -35,7 +35,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, const int box_num) { // When current MLU arch only support Block type job if (mluop::runtime::getJobLimitCapability(handle) == CN_KERNEL_CLASS_BLOCK) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = 1; k_dim->y = 1; k_dim->z = 1; @@ -43,7 +43,7 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, return; } // union1 policy func - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; // dimx equals to num of mlu cores in each cluster k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dim->y = 1; @@ -148,8 +148,8 @@ mluOpNmsRotated(mluOpHandle_t handle, const float iou_threshold, policyFunc(handle, &k_dim, &k_type, box_num); // transpose box [N, box_dim] -> [box_dim, N] - char *box_workspace = (char *)workspace; - char *scores_workspace = + int8_t *box_workspace = (int8_t *)workspace; + int8_t *scores_workspace = box_workspace + mluop::getSizeOfDataType(boxes_desc->dtype) * box_num * 
box_dim; diff --git a/kernels/nms_rotated/nms_rotated_union1.mlu b/kernels/nms_rotated/nms_rotated_union1.mlu index 0620e435f..fe583d206 100644 --- a/kernels/nms_rotated/nms_rotated_union1.mlu +++ b/kernels/nms_rotated/nms_rotated_union1.mlu @@ -32,7 +32,7 @@ #define INFINITY (340282346638528859811704183484516925440.000000) #endif -__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF]; +__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; template __mlu_func__ void nms_detection( @@ -137,7 +137,7 @@ __mlu_func__ void nms_detection( (IN_DT *)((float *)box2 + SINGLE_BOX_DIM * max_seg_iou_pad); // reuse memory from dist_ram - void *rotated_pts1_x = (char *)dist_ram; + void *rotated_pts1_x = (int8_t *)dist_ram; void *rotated_pts1_y = (float *)rotated_pts1_x + 4 * max_seg_iou_pad; void *rotated_pts2_x = (float *)rotated_pts1_y + 4 * max_seg_iou_pad; void *rotated_pts2_y = (float *)rotated_pts2_x + 4 * max_seg_iou_pad; @@ -148,7 +148,7 @@ __mlu_func__ void nms_detection( void *vec2_y = (float *)vec2_x + 4 * max_seg_iou_pad; // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((unsigned char *)score, copies_of_nram * max_seg_iou_pad); + __bang_write_zero((uint8_t *)score, copies_of_nram * max_seg_iou_pad); for (int keep = 0; keep < input_box_num; keep++) { __sync_cluster(); @@ -206,10 +206,8 @@ __mlu_func__ void nms_detection( if (nram_save_count == nram_save_limit_count) { if (taskId == 0) { - pvLock(); __memcpy(output_data, nram_save, nram_save_count * sizeof(OUT_DT), NRAM2GDRAM); - pvUnlock(); } output_data += nram_save_count; nram_save_count = 0; @@ -370,9 +368,9 @@ __mlu_func__ void nms_detection( (float *)temp10_ram, (float *)temp1_ram, (float *)temp2_ram, /*mode=*/0, seg_len); __bang_float2int32((int32_t *)temp9_ram, (float *)temp9_ram, seg_len, 0); - __bang_lut_s32((int32_t *)temp9_ram, (int32_t *)temp9_ram, - (int32_t *)table_float, seg_len, TABLE_LENGTH); - __bang_band((char *)temp2_ram, (char *)temp2_ram, (char *)temp9_ram, + 
__bang_lut((int32_t *)temp9_ram, (uint32_t *)temp9_ram, + (int32_t *)table_float, seg_len, TABLE_LENGTH); + __bang_band((int8_t *)temp2_ram, (int8_t *)temp2_ram, (int8_t *)temp9_ram, seg_len * sizeof(float)); // temp1: 1 = area_I / area_U > iou_threshold, 0 = else __bang_gt_scalar((float *)temp1_ram, (float *)temp2_ram, iou_threshold, @@ -409,18 +407,14 @@ __mlu_func__ void nms_detection( __bang_minequal((IN_DT *)score, (IN_DT *)score, (IN_DT *)temp1_ram, seg_len); - pvLock(); __memcpy(input_data_score + input_offset + i * max_seg_iou_pad, score, cpy_len * sizeof(IN_DT), scores_store_dir, cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); - pvUnlock(); } } if (clusterId == 0 && coreId == 0 && nram_save_count) { - pvLock(); __memcpy(output_data, nram_save, nram_save_count * sizeof(OUT_DT), NRAM2GDRAM); - pvUnlock(); } } diff --git a/kernels/nms_rotated/nms_utils.h b/kernels/nms_rotated/nms_utils.h index f9a5086c0..6a419aaa7 100644 --- a/kernels/nms_rotated/nms_utils.h +++ b/kernels/nms_rotated/nms_utils.h @@ -41,7 +41,7 @@ __nram__ int16_t table_half[TABLE_LENGTH] = {0, static_cast(0xffff)}; // each box data contains 5 number: x, y, w, h, a #define SINGLE_BOX_DIM 5 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void findCoreMaxBox( @@ -398,20 +398,19 @@ __mlu_func__ void getIntersectionPoints( if (sizeof(T) == sizeof(float)) { __bang_float2int32((int32_t *)temp2_ram, (float *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp2_ram, (int32_t *)temp2_ram, - (int32_t *)table_float, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int32_t *)temp2_ram, (uint32_t *)temp2_ram, + (int32_t *)table_float, actual_compute_box_num, + TABLE_LENGTH); } else { __bang_half2int16_rd((int16_t *)temp2_ram, (half *)temp2_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp2_ram, (int16_t *)temp2_ram, - (int16_t *)table_half, actual_compute_box_num, - TABLE_LENGTH); + 
__bang_lut((int16_t *)temp2_ram, (uint16_t *)temp2_ram, + (int16_t *)table_half, actual_compute_box_num, TABLE_LENGTH); } - __bang_band( - (char *)((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num), - (char *)temp7_ram, (char *)temp2_ram, - actual_compute_box_num * sizeof(T)); + __bang_band((int8_t *)((T *)intersect_pts_x + + (4 * i + j) * actual_compute_box_num), + (int8_t *)temp7_ram, (int8_t *)temp2_ram, + actual_compute_box_num * sizeof(T)); __bang_mul((T *)temp7_ram, (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram, actual_compute_box_num); @@ -419,10 +418,10 @@ __mlu_func__ void getIntersectionPoints( (T *)rotated_pts1_y + i * actual_compute_box_num, (T *)temp7_ram, actual_compute_box_num); - __bang_band( - (char *)((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num), - (char *)temp7_ram, (char *)temp2_ram, - actual_compute_box_num * sizeof(T)); + __bang_band((int8_t *)((T *)intersect_pts_y + + (4 * i + j) * actual_compute_box_num), + (int8_t *)temp7_ram, (int8_t *)temp2_ram, + actual_compute_box_num * sizeof(T)); // Assign `valid_pts` bit and accumulate `nums_in` of valid points of each // box pair @@ -486,24 +485,22 @@ __mlu_func__ void getIntersectionPoints( if (sizeof(T) == sizeof(float)) { __bang_float2int32((int32_t *)temp2_ram, (float *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp2_ram, (int32_t *)temp2_ram, - (int32_t *)table_float, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int32_t *)temp2_ram, (uint32_t *)temp2_ram, + (int32_t *)table_float, actual_compute_box_num, TABLE_LENGTH); } else { __bang_half2int16_rd((int16_t *)temp2_ram, (half *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp2_ram, (int16_t *)temp2_ram, - (int16_t *)table_half, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int16_t *)temp2_ram, (uint16_t *)temp2_ram, + (int16_t *)table_half, actual_compute_box_num, TABLE_LENGTH); } __bang_band( - (char *)((T *)intersect_pts_x + (16 + i) * 
actual_compute_box_num), - (char *)((T *)rotated_pts1_x + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_x + (16 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts1_x + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); __bang_band( - (char *)((T *)intersect_pts_y + (16 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts1_y + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_y + (16 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts1_y + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); // assign valid_pts bit and accumulate nums of valid points of each box pair __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num, @@ -562,24 +559,22 @@ __mlu_func__ void getIntersectionPoints( if (sizeof(T) == sizeof(float)) { __bang_float2int32((int32_t *)temp2_ram, (float *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s32((int32_t *)temp2_ram, (int32_t *)temp2_ram, - (int32_t *)table_float, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int32_t *)temp2_ram, (uint32_t *)temp2_ram, + (int32_t *)table_float, actual_compute_box_num, TABLE_LENGTH); } else { __bang_half2int16_rd((int16_t *)temp2_ram, (half *)temp1_ram, actual_compute_box_num, 0); - __bang_lut_s16((int16_t *)temp2_ram, (int16_t *)temp2_ram, - (int16_t *)table_half, actual_compute_box_num, - TABLE_LENGTH); + __bang_lut((int16_t *)temp2_ram, (uint16_t *)temp2_ram, + (int16_t *)table_half, actual_compute_box_num, TABLE_LENGTH); } __bang_band( - (char *)((T *)intersect_pts_x + (20 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts2_x + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_x + (20 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts2_x + i * 
actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); __bang_band( - (char *)((T *)intersect_pts_y + (20 + i) * actual_compute_box_num), - (char *)((T *)rotated_pts2_y + i * actual_compute_box_num), - (char *)temp2_ram, actual_compute_box_num * sizeof(T)); + (int8_t *)((T *)intersect_pts_y + (20 + i) * actual_compute_box_num), + (int8_t *)((T *)rotated_pts2_y + i * actual_compute_box_num), + (int8_t *)temp2_ram, actual_compute_box_num * sizeof(T)); // assign valid_pts bit and accumulate nums of valid points of each box pair __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num, diff --git a/kernels/points_in_boxes/points_in_boxes.cpp b/kernels/points_in_boxes/points_in_boxes.cpp index 0701a1a30..b6d30a5e7 100644 --- a/kernels/points_in_boxes/points_in_boxes.cpp +++ b/kernels/points_in_boxes/points_in_boxes.cpp @@ -154,7 +154,7 @@ static bool isPointsInBoxes(const mluOpHandle_t handle, cnrtDim3_t &k_dim, uint32_t core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); uint32_t cluster_used = PAD_UP(points_desc->dims[1], core_dim) / core_dim; cluster_used = cluster_used > cluster_num ? 
cluster_num : cluster_used; - k_type = CNRT_FUNC_TYPE_BLOCK; + k_type = cnrtFuncTypeBlock; k_dim.x = 1; k_dim.y = cluster_used * core_dim; k_dim.z = 1; diff --git a/kernels/points_in_boxes/points_in_boxes_block_kernel.mlu b/kernels/points_in_boxes/points_in_boxes_block_kernel.mlu index e7e663d20..f831fd981 100644 --- a/kernels/points_in_boxes/points_in_boxes_block_kernel.mlu +++ b/kernels/points_in_boxes/points_in_boxes_block_kernel.mlu @@ -29,7 +29,7 @@ #define MARGIN 1e-5 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; __mlu_func__ void check_point_in_box( float *x_nram_buffer, float *y_nram_buffer, float *z_nram_buffer, float cx, @@ -37,7 +37,6 @@ __mlu_func__ void check_point_in_box( int points_compute_num, float *temp0_nram_buffer, float *temp1_nram_buffer, float *temp2_nram_buffer, float *temp3_nram_buffer, float *temp4_nram_buffer) { -#if __BANG_ARCH__ >= 372 __bang_sub_scalar(temp0_nram_buffer, z_nram_buffer, cz, points_compute_num); // fabs(local_z) __bang_abs(temp0_nram_buffer, temp0_nram_buffer, points_compute_num); @@ -81,7 +80,6 @@ __mlu_func__ void check_point_in_box( (float)(0.5 * dy + MARGIN), points_compute_num); __bang_and(temp0_nram_buffer, temp0_nram_buffer, temp3_nram_buffer, points_compute_num); // flush res -#endif } __mlu_func__ void noPipelineLoad(float *points_cluster_base, @@ -101,7 +99,6 @@ __mlu_func__ void noPipelineCompute( float *temp0_nram_buffer, float *temp1_nram_buffer, float *temp2_nram_buffer, float *temp3_nram_buffer, float *temp4_nram_buffer) { -#if __BANG_ARCH__ >= 372 __bang_transpose(x_nram_buffer, points_nram_buffer, points_compute_num, 3); float *y_nram_buffer = x_nram_buffer + points_compute_num; @@ -129,7 +126,6 @@ __mlu_func__ void noPipelineCompute( points_compute_num); __bang_float2int32_tz(idx_nram_buffer, (float *)idx_nram_buffer, points_compute_num, 0); -#endif } __mlu_func__ void noPipelineStore(int *idx_cluster_base, int *idx_nram_buffer, diff --git 
a/kernels/poly_nms/poly_nms_block_gen_result.mlu b/kernels/poly_nms/poly_nms_block_gen_result.mlu index e0fc6691d..4eb0dd0fc 100644 --- a/kernels/poly_nms/poly_nms_block_gen_result.mlu +++ b/kernels/poly_nms/poly_nms_block_gen_result.mlu @@ -73,8 +73,8 @@ __mlu_global__ void mluGenNmsResult(int input_boxes_num, } __memcpy(mask_row_buffer, (uint32_t *)p_mask + box_id * mask_col_num, sizeof(uint32_t) * (mask_col_num), GDRAM2NRAM); - __bang_band((char *)final_mask_buffer, (char *)final_mask_buffer, - (char *)mask_row_buffer, 4 * mas_col_num_align); + __bang_band((int8_t *)final_mask_buffer, (int8_t *)final_mask_buffer, + (int8_t *)mask_row_buffer, 4 * mas_col_num_align); } if (OUTPUT_ORDER == OutputOrder::LOW_BOX_ID_FIRST) { int found = 0; diff --git a/kernels/prior_box/prior_box.cpp b/kernels/prior_box/prior_box.cpp index 5b0eb19c4..ee6a0ab2f 100644 --- a/kernels/prior_box/prior_box.cpp +++ b/kernels/prior_box/prior_box.cpp @@ -28,13 +28,13 @@ #include "core/runtime/device.h" #define api "mluOpPriorBox" -#define MLU200_500SERIERS_MAX_SUPPORT 2100 +#define MLU500SERIERS_MAX_SUPPORT 2100 #define MLU300SERIERS_MAX_SUPPORT 2900 // policy function static void policyFuncPriorBox(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, const int count) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; uint32_t cluster_max = mluop::runtime::getClusterLimitCapability(handle); uint32_t core_num_per_cluster = mluop::runtime::getCoreNumOfEachUnionCapability(handle); @@ -132,8 +132,8 @@ mluOpStatus_t mluOpPriorBoxParamCheck( const int num_priors = getNumPriors(min_sizes_desc, aspect_ratios_desc, max_sizes_desc); // check num_priors limit - const int max_support_num_priors = (handle->arch < 300 || handle->arch > 500) - ? MLU200_500SERIERS_MAX_SUPPORT + const int max_support_num_priors = (handle->arch > 500) + ? 
MLU500SERIERS_MAX_SUPPORT : MLU300SERIERS_MAX_SUPPORT; if (num_priors > max_support_num_priors) { LOG(ERROR) << api << " Support max num_priors is " << max_support_num_priors diff --git a/kernels/prior_box/prior_box_block.mlu b/kernels/prior_box/prior_box_block.mlu index 1c22584d1..6c3bc42bd 100644 --- a/kernels/prior_box/prior_box_block.mlu +++ b/kernels/prior_box/prior_box_block.mlu @@ -28,7 +28,7 @@ #define ALIGN_BYTE 1024 #define MIN(x, y) ((x) < (y)) ? (x) : (y) #define EPSILON 1e-6 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void generate_AbAb_Mask(T *a_mask, T a_index, T *b_mask, T b_index, diff --git a/kernels/psamask/psamask.cpp b/kernels/psamask/psamask.cpp index e5dcfe33c..2ca6c6a18 100644 --- a/kernels/psamask/psamask.cpp +++ b/kernels/psamask/psamask.cpp @@ -261,7 +261,7 @@ mluOpStatus_t mluOpPsamaskForward(mluOpHandle_t handle, const int psa_type, // generate mluOpPsamaskForward prototxt end! mluOpStatus_t ret = MLUOP_STATUS_SUCCESS; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; cnrtDim3_t k_dim; PartitionSeg partition_info; policyFunc(handle, &k_dim, &k_type, &partition_info, n, h_feature); @@ -331,7 +331,7 @@ mluOpStatus_t mluOpPsamaskBackward(mluOpHandle_t handle, const int psa_type, // generate mluOpPsamaskBackward prototxt end! 
mluOpStatus_t ret = MLUOP_STATUS_SUCCESS; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; cnrtDim3_t k_dim; PartitionSeg partition_info; policyFunc(handle, &k_dim, &k_type, &partition_info, n, h_feature); diff --git a/kernels/psamask/psamask_union1.mlu b/kernels/psamask/psamask_union1.mlu index a0e646765..156f5356e 100644 --- a/kernels/psamask/psamask_union1.mlu +++ b/kernels/psamask/psamask_union1.mlu @@ -31,7 +31,7 @@ #define DEBUG_TASKID 0 -__nram__ char buf[MAX_NRAM_SIZE]; +__nram__ int8_t buf[MAX_NRAM_SIZE]; template __mlu_func__ void swap(T &a, T &b) { diff --git a/kernels/psroipool/psroipool.cpp b/kernels/psroipool/psroipool.cpp index d9cb5c623..5bba8273c 100644 --- a/kernels/psroipool/psroipool.cpp +++ b/kernels/psroipool/psroipool.cpp @@ -38,7 +38,7 @@ static void policyFuncPsRoiPool(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, const int nums) { size_t union_number = mluop::runtime::getClusterLimitCapability(handle); size_t core_in_cluster = handle->core_num_per_cluster; - *k_type = CNRT_FUNC_TYPE_UNION1; // default func type + *k_type = cnrtFuncTypeUnion1; // default func type k_dim->x = core_in_cluster; uint32_t use_cluster = (nums + core_in_cluster - 1) / core_in_cluster; k_dim->y = use_cluster > union_number ? 
union_number : use_cluster; diff --git a/kernels/psroipool/psroipool_block.mlu b/kernels/psroipool/psroipool_block.mlu index 11b9ad37d..1804ae1ee 100644 --- a/kernels/psroipool/psroipool_block.mlu +++ b/kernels/psroipool/psroipool_block.mlu @@ -29,27 +29,16 @@ #define ALIGN_SIZE 64 -__nram__ char nram_ptr[MAX_NRAM_SIZE]; +__nram__ int8_t nram_ptr[MAX_NRAM_SIZE]; // This function is used to align the round_rn template __mlu_func__ T scalarRound(T key) { -#if __BANG_ARCH__ >= 370 if (sizeof(T) == 2) { return (T)(__half2int_rd(T(key))); } else { return (T)(__float2int_rd(T(key))); } -#else - int key_remain = ((int)key) % 2; - int result = 0; - if (!(((key - (int)key) == 0.5) || ((key - (int)key) == -0.5))) { - return (T)(round(key)); - } else { - result = (int)(key + key_remain * 0.5); - return (T)result; - } -#endif } template diff --git a/kernels/roi_align_rotated/roi_align_rotated.cpp b/kernels/roi_align_rotated/roi_align_rotated.cpp index b98d5f4a7..3b4e8a95c 100644 --- a/kernels/roi_align_rotated/roi_align_rotated.cpp +++ b/kernels/roi_align_rotated/roi_align_rotated.cpp @@ -36,7 +36,7 @@ static void policyFunc(const mluOpHandle_t handle, const int bin_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { size_t core_num = handle->core_num_per_cluster; size_t cluster_num = mluop::runtime::getJobLimitCapability(handle) / core_num; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_num; size_t use_cluster = (bin_num + core_num - 1) / core_num; k_dim->y = use_cluster > cluster_num ? 
cluster_num : use_cluster; diff --git a/kernels/roi_align_rotated/roi_align_rotated_block.mlu b/kernels/roi_align_rotated/roi_align_rotated_block.mlu index bc324aa57..684de42df 100644 --- a/kernels/roi_align_rotated/roi_align_rotated_block.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_block.mlu @@ -35,7 +35,7 @@ #define ROI_OFFSET 6 #define SAMPLING_NUM 4 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void swap(T &a, T &b) { diff --git a/kernels/roi_crop/roi_crop.cpp b/kernels/roi_crop/roi_crop.cpp index c684b933e..ffdb6078c 100644 --- a/kernels/roi_crop/roi_crop.cpp +++ b/kernels/roi_crop/roi_crop.cpp @@ -36,7 +36,7 @@ static void policyFunc(const mluOpHandle_t handle, int bin_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { uint32_t cluster_num = mluop::runtime::getClusterLimitCapability(handle); uint32_t core_in_cluster = handle->core_num_per_cluster; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_in_cluster; uint32_t use_cluster = (bin_num + core_in_cluster - 1) / core_in_cluster; k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster; diff --git a/kernels/roi_crop/roi_crop_block.mlu b/kernels/roi_crop/roi_crop_block.mlu index f3f6d7c5b..f68f804aa 100644 --- a/kernels/roi_crop/roi_crop_block.mlu +++ b/kernels/roi_crop/roi_crop_block.mlu @@ -25,7 +25,7 @@ #include "core/logging.h" #include "kernels/kernel.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void swap(T &a, T &b) { diff --git a/kernels/roiaware_pool3d/roiaware_pool3d.cpp b/kernels/roiaware_pool3d/roiaware_pool3d.cpp index 5f46ed97e..117fdcfe4 100644 --- a/kernels/roiaware_pool3d/roiaware_pool3d.cpp +++ b/kernels/roiaware_pool3d/roiaware_pool3d.cpp @@ -51,7 +51,7 @@ static mluOpStatus_t kernelPtsIdxOfVoxelsPolicyFunc( k_dim->x = core_limit; k_dim->y = (task_dim / core_limit) > 0 ? 
(task_dim / core_limit) : 1; k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; return MLUOP_STATUS_SUCCESS; } @@ -73,7 +73,7 @@ static mluOpStatus_t kernelRoiawarePool3dForwardPolicyFunc( k_dim->x = core_limit; k_dim->y = (task_dim / core_limit) > 0 ? (task_dim / core_limit) : 1; k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; return MLUOP_STATUS_SUCCESS; } @@ -94,7 +94,7 @@ static mluOpStatus_t kernelRoiawarePool3dBackwardPolicyFunc( k_dim->x = core_limit; k_dim->y = (task_dim / core_limit) > 0 ? (task_dim / core_limit) : 1; k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; return MLUOP_STATUS_SUCCESS; } @@ -373,9 +373,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpRoiAwarePool3dForward( uint64_t pts_feature_dtype_size = mluOpGetTensorElementNum(pts_feature_desc) * mluop::getSizeOfDataType(data_dtype); void *pts_workspace = workspace; - void *pts_feature_workspace = (char *)pts_workspace + pts_dtype_size; + void *pts_feature_workspace = (int8_t *)pts_workspace + pts_dtype_size; void *transpose_workspace = - (char *)pts_feature_workspace + pts_feature_dtype_size; + (int8_t *)pts_feature_workspace + pts_feature_dtype_size; VLOG(5) << "[mluOpRoiAwarePool3dForward] cnnlTranspose pts start."; int pts_dim = pts_desc->dim; diff --git a/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu b/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu index 09d7c9cea..912de919d 100644 --- a/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu +++ b/kernels/roiaware_pool3d/roiaware_pool3d_union1.mlu @@ -34,14 +34,13 @@ #define HALF_NRAM_BUFFER_NUM 25 #define ALIGN_NUM 64 -__nram__ char data_nram[MAX_NRAM_SIZE]; +__nram__ int8_t data_nram[MAX_NRAM_SIZE]; template __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( const int pool_method, const int boxes_num, const int pts_num, const int max_pts_each_voxel, const int out_x, const int out_y, const int out_z, const T *rois, const T *pts, int *pts_idx_of_voxels) 
{ -#if __BANG_ARCH__ >= 322 if (__is_mpu()) { return; } @@ -54,13 +53,13 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( (MAX_NRAM_SIZE / sizeof(half) / HALF_NRAM_BUFFER_NUM), ALIGN_NUM); } - char *X = NULL; - char *Y = NULL; - char *Z = NULL; - char *local_X = NULL; - char *local_Y = NULL; - char *local_Z = NULL; - char *nram_pts_in_flag = NULL; + int8_t *X = NULL; + int8_t *Y = NULL; + int8_t *Z = NULL; + int8_t *local_X = NULL; + int8_t *local_Y = NULL; + int8_t *local_Z = NULL; + int8_t *nram_pts_in_flag = NULL; float *temp_buffer1 = NULL; float *temp_buffer2 = NULL; float *temp_buffer3 = NULL; @@ -73,13 +72,13 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( float *fp_local_Z = NULL; float *fp_nram_pts_in_flag = NULL; if (__mluop_is_float()) { - X = (char *)((float *)data_nram); - Y = (char *)((float *)data_nram + nram_pts_num); - Z = (char *)((float *)data_nram + nram_pts_num * 2); - local_X = (char *)((float *)data_nram + nram_pts_num * 3); - local_Y = (char *)((float *)data_nram + nram_pts_num * 4); - local_Z = (char *)((float *)data_nram + nram_pts_num * 5); - nram_pts_in_flag = (char *)((float *)data_nram + nram_pts_num * 6); + X = (int8_t *)((float *)data_nram); + Y = (int8_t *)((float *)data_nram + nram_pts_num); + Z = (int8_t *)((float *)data_nram + nram_pts_num * 2); + local_X = (int8_t *)((float *)data_nram + nram_pts_num * 3); + local_Y = (int8_t *)((float *)data_nram + nram_pts_num * 4); + local_Z = (int8_t *)((float *)data_nram + nram_pts_num * 5); + nram_pts_in_flag = (int8_t *)((float *)data_nram + nram_pts_num * 6); temp_buffer1 = (float *)data_nram + nram_pts_num * 7; temp_buffer2 = (float *)data_nram + nram_pts_num * 8; temp_buffer3 = (float *)data_nram + nram_pts_num * 9; @@ -92,13 +91,13 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( fp_local_Z = (float *)local_Z; fp_nram_pts_in_flag = (float *)nram_pts_in_flag; } else { - X = (char *)((half *)data_nram); - Y = (char *)((half *)data_nram + nram_pts_num); - Z = (char *)((half 
*)data_nram + nram_pts_num * 2); - local_X = (char *)((half *)data_nram + nram_pts_num * 4); - local_Y = (char *)((half *)data_nram + nram_pts_num * 6); - local_Z = (char *)((half *)data_nram + nram_pts_num * 8); - nram_pts_in_flag = (char *)((half *)data_nram + nram_pts_num * 10); + X = (int8_t *)((half *)data_nram); + Y = (int8_t *)((half *)data_nram + nram_pts_num); + Z = (int8_t *)((half *)data_nram + nram_pts_num * 2); + local_X = (int8_t *)((half *)data_nram + nram_pts_num * 4); + local_Y = (int8_t *)((half *)data_nram + nram_pts_num * 6); + local_Z = (int8_t *)((half *)data_nram + nram_pts_num * 8); + nram_pts_in_flag = (int8_t *)((half *)data_nram + nram_pts_num * 10); temp_buffer1 = (float *)((half *)data_nram + nram_pts_num * 11); temp_buffer2 = (float *)((half *)data_nram + nram_pts_num * 13); temp_buffer3 = (float *)((half *)data_nram + nram_pts_num * 15); @@ -208,7 +207,7 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( __bang_add_scalar((T *)local_Y, (T *)local_Y, (T)(dy_2), compute_pts_num); // local_Z do not need to add dz/2.0 -#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372) +#if __BANG_ARCH__ != 372 __bang_div((T *)local_X, (T *)local_X, (T)x_res, compute_pts_num); __bang_div((T *)local_Y, (T *)local_Y, (T)y_res, compute_pts_num); __bang_div((T *)local_Z, (T *)local_Z, (T)z_res, compute_pts_num); @@ -308,7 +307,6 @@ __mlu_entry__ void MLUMultiKernelPtsIdxOfVoxels( } } } -#endif } template @@ -321,7 +319,6 @@ __mlu_entry__ void MLUMultiKernelRoiawarePool3dForward( // pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z, max_pts_each_voxel) // argmax: (boxes_num, out_x, out_y, out_z, channels) // pooled_features: (boxes_num, out_x, out_y, out_z, channels) -#if __BANG_ARCH__ >= 322 if (__is_mpu()) { return; } @@ -343,8 +340,8 @@ __mlu_entry__ void MLUMultiKernelRoiawarePool3dForward( int *nram_argmax_cur_voxel = (int *)((T *)nram_pooled_features_cur_voxel + nram_channels_limit); // nram_argmax_cur_voxel [nram_channels_limit] - char 
*one_pooled_feature = - (char *)((int *)nram_argmax_cur_voxel + nram_channels_limit); + int8_t *one_pooled_feature = + (int8_t *)((int *)nram_argmax_cur_voxel + nram_channels_limit); // one_pooled_feature [128] int channels_loop_times = channels / nram_channels_limit; int rem_channels = channels % nram_channels_limit; @@ -414,7 +411,6 @@ __mlu_entry__ void MLUMultiKernelRoiawarePool3dForward( } } } -#endif } mluOpStatus_t MLUOP_WIN_API KernelPtsIdxOfVoxels( @@ -480,7 +476,6 @@ __mlu_entry__ void MLUMultiKernelRoiawareMaxPool3dBackward( // argmax: (boxes_num, out_x, out_y, out_z, channels) // grad_out: (boxes_num, out_x, out_y, out_z, channels) // grad_in: (pts_num, channels) -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -531,7 +526,6 @@ __mlu_entry__ void MLUMultiKernelRoiawareMaxPool3dBackward( } } } -#endif } template @@ -542,7 +536,6 @@ __mlu_entry__ void MLUMultiKernelRoiawareAvgPool3dBackward( // pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z, max_pts_each_voxel) // grad_out: (boxes_num, out_x, out_y, out_z, channels) // grad_in: (pts_num, channels) -#if __BANG_ARCH__ >= 372 if (__is_mpu()) { return; } @@ -614,7 +607,6 @@ __mlu_entry__ void MLUMultiKernelRoiawareAvgPool3dBackward( } } } -#endif } mluOpStatus_t MLUOP_WIN_API KernelRoiawarePool3dBackward( diff --git a/kernels/roipoint_pool3d/roipoint_pool3d.cpp b/kernels/roipoint_pool3d/roipoint_pool3d.cpp index 30b1ce7a2..b4d363ae5 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d.cpp +++ b/kernels/roipoint_pool3d/roipoint_pool3d.cpp @@ -280,10 +280,10 @@ mluOpStatus_t MLUOP_WIN_API mluOpRoiPointPool3d( void *points_xyz = workspace; // point_features : [B, C, N] void *point_features_transpose = - (char *)workspace + + (int8_t *)workspace + points_element_num * mluop::getSizeOfDataType(points_desc->dtype); void *transpose_workspace = - (char *)point_features_transpose + + (int8_t *)point_features_transpose + point_features_element_num * mluop::getSizeOfDataType(point_features_desc->dtype); 
size_t transpose_workspace_size = 0; @@ -370,7 +370,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpRoiPointPool3d( k_dims.x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dims.y = mluop::runtime::getClusterLimitCapability(handle); k_dims.z = 1; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; if (boxes_num <= 10240) { VLOG(5) << "Launch Kernel KernelRoipointPool3d<<queue, points_desc->dtype, batch_size, pts_num, boxes_num, feature_in_len, - sampled_pts_num, (char *)points_xyz, - (char *)point_features_transpose, (char *)boxes3d, - (char *)pooled_features, (char *)pooled_empty_flag)); + sampled_pts_num, (int8_t *)points_xyz, + (int8_t *)point_features_transpose, (int8_t *)boxes3d, + (int8_t *)pooled_features, (int8_t *)pooled_empty_flag)); } else { VLOG(5) << "Launch Kernel KernelRoipointPool3dLargeBoxesNum<<queue, points_desc->dtype, batch_size, pts_num, boxes_num, feature_in_len, - sampled_pts_num, (char *)points_xyz, - (char *)point_features_transpose, (char *)boxes3d, - (char *)pooled_features, (char *)pooled_empty_flag)); + sampled_pts_num, (int8_t *)points_xyz, + (int8_t *)point_features_transpose, (int8_t *)boxes3d, + (int8_t *)pooled_features, (int8_t *)pooled_empty_flag)); } GEN_CASE_END(); return MLUOP_STATUS_SUCCESS; diff --git a/kernels/roipoint_pool3d/roipoint_pool3d.h b/kernels/roipoint_pool3d/roipoint_pool3d.h index e7fe95020..631b0a461 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d.h +++ b/kernels/roipoint_pool3d/roipoint_pool3d.h @@ -29,16 +29,16 @@ mluOpStatus_t MLUOP_WIN_API KernelRoipointPool3d( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t d_type, const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram); + const int8_t *points_xyz_gdram, 
const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram); mluOpStatus_t MLUOP_WIN_API KernelRoipointPool3dLargeBoxesNum( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t d_type, const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram); + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram); #endif // KERNELS_ROIPOINT_POOL3D_ROIPOINT_POOL3D_H diff --git a/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu b/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu index 53187f331..ab06ec577 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu +++ b/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu @@ -39,7 +39,7 @@ ***********************************************************************************************/ #define TWELVE_SPLIT 12 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, @@ -71,12 +71,7 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, // |z - cz| __bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num); // |z - cz| > dz / 2.0 -#if __BANG_ARCH__ >= 322 __bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num); -#else - __bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz)); - __bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num); -#endif // !(|z - cz| > dz / 2.0) __bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num); // (x - cx) * cos(-rz) @@ -88,12 +83,7 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, 
const size_t deal_num, // |local_x| __bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num); // |local_x| < dx / 2.0 -#if __BANG_ARCH__ >= 322 __bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num); -#else - __bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx)); - __bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num); -#endif // (x - cx) * sin(-rz) __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num); // (y - cy) * cos(-rz) @@ -103,12 +93,7 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, // |local_y| __bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num); // |local_y| < dy / 2.0 -#if __BANG_ARCH__ >= 322 __bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num); -#else - __bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy)); - __bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num); -#endif // pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz // / 2.0 __bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num); @@ -117,13 +102,14 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, template __mlu_func__ void computeStoreRoipointPool3d( - char *boxes3d, int *cnt, char *points_x, char *points_y, char *points_z, - const char *point_features, char *auxiliary_a, char *auxiliary_b, - char *auxiliary_c, char *auxiliary_d, char *auxiliary_e, char *auxiliary_f, - const int box_idx, const int pts_num, const int feature_in_len, - const int sampled_pts_num, const size_t span_num_deal, - char *pooled_features_gdram, char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; + int8_t *boxes3d, int *cnt, int8_t *points_x, int8_t *points_y, + int8_t *points_z, const int8_t *point_features, int8_t *auxiliary_a, + int8_t *auxiliary_b, int8_t *auxiliary_c, int8_t *auxiliary_d, + int8_t *auxiliary_e, int8_t *auxiliary_f, const int box_idx, + const int pts_num, const 
int feature_in_len, const int sampled_pts_num, + const size_t span_num_deal, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram) { + int8_t *pts_assign = auxiliary_a; if (cnt[box_idx] >= sampled_pts_num) { return; } @@ -193,14 +179,14 @@ __mlu_func__ void computeStoreRoipointPool3d( template __mlu_func__ void computeStoreLastBlockRoipointPool3d( - char *boxes3d, int *cnt, char *points_x, char *points_y, char *points_z, - const char *point_features, char *auxiliary_a, char *auxiliary_b, - char *auxiliary_c, char *auxiliary_d, char *auxiliary_e, char *auxiliary_f, - const int box_idx, const int pts_num, const int feature_in_len, - const int sampled_pts_num, const size_t span_num_deal, - const size_t auxiliary_num_deal, char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; + int8_t *boxes3d, int *cnt, int8_t *points_x, int8_t *points_y, + int8_t *points_z, const int8_t *point_features, int8_t *auxiliary_a, + int8_t *auxiliary_b, int8_t *auxiliary_c, int8_t *auxiliary_d, + int8_t *auxiliary_e, int8_t *auxiliary_f, const int box_idx, + const int pts_num, const int feature_in_len, const int sampled_pts_num, + const size_t span_num_deal, const size_t auxiliary_num_deal, + int8_t *pooled_features_gdram, int8_t *pooled_empty_flag_gdram) { + int8_t *pts_assign = auxiliary_a; if (cnt[box_idx] >= sampled_pts_num) { // pooled_empty_flag_gdram set 0 *((int *)auxiliary_a) = 0; @@ -329,9 +315,9 @@ template __mlu_global__ void MLUKernelRoipointPool3d( const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram) { if (__is_mpu()) { return; } @@ -356,10 
+342,10 @@ __mlu_global__ void MLUKernelRoipointPool3d( : ((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num; // points_xyz : [3, B, N] - const char *points_x_gdram = points_xyz_gdram; - const char *points_y_gdram = + const int8_t *points_x_gdram = points_xyz_gdram; + const int8_t *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T); - const char *points_z_gdram = + const int8_t *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T); size_t boxes3d_size = PAD_UP(boxes_num * 7, NFU_ALIGN_SIZE) * sizeof(T); @@ -371,18 +357,18 @@ __mlu_global__ void MLUKernelRoipointPool3d( int32_t repeat = pts_num / span_num_deal; size_t rem = pts_num % span_num_deal; size_t align_rem = CEIL_ALIGN(rem, align_num); - char *boxes3d = nram_buffer; - char *cnt = nram_buffer + boxes3d_size; - char *ping_points_x = cnt + cnt_size; - char *ping_points_y = ping_points_x + span_num_deal * sizeof(T); - char *ping_points_z = ping_points_y + span_num_deal * sizeof(T); + int8_t *boxes3d = nram_buffer; + int8_t *cnt = nram_buffer + boxes3d_size; + int8_t *ping_points_x = cnt + cnt_size; + int8_t *ping_points_y = ping_points_x + span_num_deal * sizeof(T); + int8_t *ping_points_z = ping_points_y + span_num_deal * sizeof(T); size_t ping_pong_gap = 3 * span_num_deal * sizeof(T); - char *auxiliary_a = ping_points_x + 2 * ping_pong_gap; - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T); - char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T); - char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T); - char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T); + int8_t *auxiliary_a = ping_points_x + 2 * ping_pong_gap; + int8_t *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); + int8_t *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T); + int8_t *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T); + int8_t *auxiliary_e = auxiliary_d + 
span_num_deal * sizeof(T); + int8_t *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T); size_t span_load_input1_size = span_num_deal * sizeof(T); size_t span_load_input2_size = span_num_deal * sizeof(T); size_t span_load_input3_size = span_num_deal * sizeof(T); @@ -393,16 +379,19 @@ __mlu_global__ void MLUKernelRoipointPool3d( boxes_num * 7 * sizeof(T), GDRAM2NRAM); __bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE)); - const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T); - const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T); - const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T); - const char *point_features_start = + const int8_t *points_x_start = + points_x_gdram + bs_idx * pts_num * sizeof(T); + const int8_t *points_y_start = + points_y_gdram + bs_idx * pts_num * sizeof(T); + const int8_t *points_z_start = + points_z_gdram + bs_idx * pts_num * sizeof(T); + const int8_t *point_features_start = point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T); - char *pooled_features_start = + int8_t *pooled_features_start = pooled_features_gdram + (bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T); - char *pooled_empty_flag_start = + int8_t *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int); size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0; size_t box_end = bs_idx == batch_end ? 
last_batch_box_end : boxes_num; @@ -520,24 +509,24 @@ __mlu_global__ void MLUKernelRoipointPool3d( template __mlu_global__ void MLUKernelRoipointPool3d( const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram); + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram); template __mlu_global__ void MLUKernelRoipointPool3d( const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram); + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram); mluOpStatus_t MLUOP_WIN_API KernelRoipointPool3d( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t d_type, const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram) { switch (d_type) { /* Only float and half data types are supported in host-side CPP file fool-proof processing. 
*/ diff --git a/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu b/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu index 1af17a0bc..fe2631d95 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu +++ b/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu @@ -37,7 +37,7 @@ ***********************************************************************************************/ #define TWELVE_SPLIT 12 -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, @@ -69,12 +69,7 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, // |z - cz| __bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num); // |z - cz| > dz / 2.0 -#if __BANG_ARCH__ >= 322 __bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num); -#else - __bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz)); - __bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num); -#endif // !(|z - cz| > dz / 2.0) __bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num); // (x - cx) * cos(-rz) @@ -86,12 +81,7 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, // |local_x| __bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num); // |local_x| < dx / 2.0 -#if __BANG_ARCH__ >= 322 __bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num); -#else - __bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx)); - __bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num); -#endif // (x - cx) * sin(-rz) __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num); // (y - cy) * cos(-rz) @@ -101,12 +91,7 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, // |local_y| __bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num); // |local_y| < dy / 2.0 -#if 
__BANG_ARCH__ >= 322 __bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num); -#else - __bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy)); - __bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num); -#endif // pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz // / 2.0 __bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num); @@ -115,13 +100,14 @@ __mlu_func__ void checkPointsInBox3d(const T *boxes3d, const size_t deal_num, template __mlu_func__ void computeStoreRoipointPool3d( - char *boxes3d, int *cnt, char *points_x, char *points_y, char *points_z, - const char *point_features, char *auxiliary_a, char *auxiliary_b, - char *auxiliary_c, char *auxiliary_d, char *auxiliary_e, char *auxiliary_f, - const int box_idx, const int pts_num, const int feature_in_len, - const int sampled_pts_num, const size_t span_num_deal, - char *pooled_features_gdram, char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; + int8_t *boxes3d, int *cnt, int8_t *points_x, int8_t *points_y, + int8_t *points_z, const int8_t *point_features, int8_t *auxiliary_a, + int8_t *auxiliary_b, int8_t *auxiliary_c, int8_t *auxiliary_d, + int8_t *auxiliary_e, int8_t *auxiliary_f, const int box_idx, + const int pts_num, const int feature_in_len, const int sampled_pts_num, + const size_t span_num_deal, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram) { + int8_t *pts_assign = auxiliary_a; if (*cnt >= sampled_pts_num) { return; } @@ -190,14 +176,14 @@ __mlu_func__ void computeStoreRoipointPool3d( template __mlu_func__ void computeStoreLastBlockRoipointPool3d( - char *boxes3d, int *cnt, char *points_x, char *points_y, char *points_z, - const char *point_features, char *auxiliary_a, char *auxiliary_b, - char *auxiliary_c, char *auxiliary_d, char *auxiliary_e, char *auxiliary_f, - const int box_idx, const int pts_num, const int feature_in_len, - const int sampled_pts_num, const size_t span_num_deal, - 
const size_t auxiliary_num_deal, char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { - char *pts_assign = auxiliary_a; + int8_t *boxes3d, int *cnt, int8_t *points_x, int8_t *points_y, + int8_t *points_z, const int8_t *point_features, int8_t *auxiliary_a, + int8_t *auxiliary_b, int8_t *auxiliary_c, int8_t *auxiliary_d, + int8_t *auxiliary_e, int8_t *auxiliary_f, const int box_idx, + const int pts_num, const int feature_in_len, const int sampled_pts_num, + const size_t span_num_deal, const size_t auxiliary_num_deal, + int8_t *pooled_features_gdram, int8_t *pooled_empty_flag_gdram) { + int8_t *pts_assign = auxiliary_a; if (*cnt >= sampled_pts_num) { // pooled_empty_flag_gdram set 0 *((int *)auxiliary_a) = 0; @@ -327,9 +313,9 @@ template __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram) { if (__is_mpu()) { return; } @@ -354,10 +340,10 @@ __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( : ((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num; // points_xyz : [3, B, N] - const char *points_x_gdram = points_xyz_gdram; - const char *points_y_gdram = + const int8_t *points_x_gdram = points_xyz_gdram; + const int8_t *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T); - const char *points_z_gdram = + const int8_t *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T); size_t boxes3d_size = PAD_UP(7, NFU_ALIGN_SIZE) * sizeof(T); @@ -368,17 +354,17 @@ __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( int32_t repeat = pts_num / span_num_deal; size_t 
rem = pts_num % span_num_deal; size_t align_rem = CEIL_ALIGN(rem, align_num); - char *boxes3d = nram_buffer; - char *ping_points_x = nram_buffer + boxes3d_size; - char *ping_points_y = ping_points_x + span_num_deal * sizeof(T); - char *ping_points_z = ping_points_y + span_num_deal * sizeof(T); + int8_t *boxes3d = nram_buffer; + int8_t *ping_points_x = nram_buffer + boxes3d_size; + int8_t *ping_points_y = ping_points_x + span_num_deal * sizeof(T); + int8_t *ping_points_z = ping_points_y + span_num_deal * sizeof(T); size_t ping_pong_gap = 3 * span_num_deal * sizeof(T); - char *auxiliary_a = ping_points_x + 2 * ping_pong_gap; - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T); - char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T); - char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T); - char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T); + int8_t *auxiliary_a = ping_points_x + 2 * ping_pong_gap; + int8_t *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); + int8_t *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T); + int8_t *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T); + int8_t *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T); + int8_t *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T); size_t span_load_input1_size = span_num_deal * sizeof(T); size_t span_load_input2_size = span_num_deal * sizeof(T); size_t span_load_input3_size = span_num_deal * sizeof(T); @@ -386,16 +372,19 @@ __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( int cnt = 0; for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) { - const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T); - const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T); - const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T); - const char *point_features_start = + const int8_t *points_x_start = + points_x_gdram + bs_idx * 
pts_num * sizeof(T); + const int8_t *points_y_start = + points_y_gdram + bs_idx * pts_num * sizeof(T); + const int8_t *points_z_start = + points_z_gdram + bs_idx * pts_num * sizeof(T); + const int8_t *point_features_start = point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T); - char *pooled_features_start = + int8_t *pooled_features_start = pooled_features_gdram + (bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T); - char *pooled_empty_flag_start = + int8_t *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int); size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0; size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num; @@ -516,24 +505,24 @@ __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( template __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram); + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram); template __mlu_global__ void MLUKernelRoipointPool3dLargeBoxesNum( const int batch_size, const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram); + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram); mluOpStatus_t MLUOP_WIN_API KernelRoipointPool3dLargeBoxesNum( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t d_type, const int batch_size, 
const int pts_num, const int boxes_num, const int feature_in_len, const int sampled_pts_num, - const char *points_xyz_gdram, const char *point_features_gdram, - const char *boxes3d_gdram, char *pooled_features_gdram, - char *pooled_empty_flag_gdram) { + const int8_t *points_xyz_gdram, const int8_t *point_features_gdram, + const int8_t *boxes3d_gdram, int8_t *pooled_features_gdram, + int8_t *pooled_empty_flag_gdram) { switch (d_type) { /* Only float and half data types are supported in host-side CPP file fool-proof processing. */ diff --git a/kernels/rotated_feature_align/rotated_feature_align.cpp b/kernels/rotated_feature_align/rotated_feature_align.cpp index 15136c0c0..c383a3322 100644 --- a/kernels/rotated_feature_align/rotated_feature_align.cpp +++ b/kernels/rotated_feature_align/rotated_feature_align.cpp @@ -40,7 +40,7 @@ static void policyFunc(const mluOpHandle_t handle, const size_t num_bin = num_rois * pooled_height * pooled_width; size_t core_num = handle->core_num_per_cluster; size_t cluster_num = mluop::runtime::getJobLimitCapability(handle) / core_num; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_num; size_t use_cluster = (num_bin + core_num - 1) / core_num; k_dim->y = use_cluster > cluster_num ? 
cluster_num : use_cluster; diff --git a/kernels/rotated_feature_align/rotated_feature_align_block.mlu b/kernels/rotated_feature_align/rotated_feature_align_block.mlu index fc9cde722..58de5d8a5 100644 --- a/kernels/rotated_feature_align/rotated_feature_align_block.mlu +++ b/kernels/rotated_feature_align/rotated_feature_align_block.mlu @@ -29,7 +29,7 @@ #define SEG_NUM 10 #define BBOXES_ALIGN 64 -__nram__ char data_nram[MAX_NRAM_SIZE]; +__nram__ int8_t data_nram[MAX_NRAM_SIZE]; template __mlu_func__ void swap_ptr(T &a, T &b) { diff --git a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_block.mlu b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_block.mlu index 90cb6cf1b..21124ba89 100644 --- a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_block.mlu +++ b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_block.mlu @@ -36,7 +36,7 @@ __nram__ float filter_kd_index[KERNEL_V]; __nram__ float filter_kh_index[KERNEL_V]; __nram__ float filter_kw_index[KERNEL_V]; -__nram__ char nbuf_total[NRAM_LIMIT]; +__nram__ int8_t nbuf_total[NRAM_LIMIT]; __mlu_func__ void computeIndex(int32_t *nram_output, int32_t *nram_input, int32_t *nram_aux_a, float *nram_aux_b, @@ -44,7 +44,6 @@ __mlu_func__ void computeIndex(int32_t *nram_output, int32_t *nram_input, Dilation dilation, Padding padding, int32_t deal_num, int32_t step_index_start, int32_t k_dhw, int32_t batch) { -#if __BANG_ARCH__ >= 370 int32_t len_l_dim = deal_num * (Ndim + 1); int32_t deal_num_lk = deal_num * k_dhw; int32_t output_size = @@ -82,7 +81,6 @@ __mlu_func__ void computeIndex(int32_t *nram_output, int32_t *nram_input, output_size); __bang_transpose((int32_t *)nram_output + 2 * deal_num_lk, (int32_t *)nram_aux_a + 2 * deal_num_lk, deal_num, k_dhw); -#endif } __mlu_global__ void MLUBlockDefaultGetIndicePairKernel1( @@ -91,7 +89,6 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel1( InputSpace host_input_space, OutputSpace host_output_space, Stride host_stride, Dilation host_dilation, Padding 
host_padding, int32_t core_num_l, int32_t input_active_site, int32_t batch) { -#if __BANG_ARCH__ >= 370 /* nram_space |input| mask_all | indice_index_in | indice_out_expand | 4l + 3 k l |input| mask_all | indice_index_in | indice_out_expand | 4l + 3 k l @@ -127,7 +124,7 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel1( (int32_t *)indices_in + (offset_l_job + i * core_num_l) * Ndim; int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - __memcpy_async((char *)nram_input_t, (char *)indices_in_addr, + __memcpy_async((int8_t *)nram_input_t, (int8_t *)indices_in_addr, deal_num * Ndim * sizeof(int), GDRAM2NRAM); } if (1 <= i && i < (repeat + 1)) { @@ -147,30 +144,28 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel1( (offset_l_job + (i - 2) * core_num_l) * sizeof(int); int32_t *nram_output_t = (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)mask_all_ws + gdram_offset, - (char *)(nram_output_t), deal_num * sizeof(int), + __memcpy_async((int8_t *)mask_all_ws + gdram_offset, + (int8_t *)(nram_output_t), deal_num * sizeof(int), NRAM2GDRAM, input_active_site * sizeof(int), deal_num * sizeof(int), k_dhw - 1); - __memcpy_async((char *)indice_index_in_ws + gdram_offset, - (char *)(nram_output_t + deal_num * k_dhw), + __memcpy_async((int8_t *)indice_index_in_ws + gdram_offset, + (int8_t *)(nram_output_t + deal_num * k_dhw), deal_num * sizeof(int), NRAM2GDRAM, input_active_site * sizeof(int), deal_num * sizeof(int), k_dhw - 1); - __memcpy_async((char *)indice_out_expand_ws + gdram_offset, - (char *)(nram_output_t + 2 * deal_num * k_dhw), + __memcpy_async((int8_t *)indice_out_expand_ws + gdram_offset, + (int8_t *)(nram_output_t + 2 * deal_num * k_dhw), deal_num * sizeof(int), NRAM2GDRAM, input_active_site * sizeof(int), deal_num * sizeof(int), k_dhw - 1); } __sync(); } -#endif } __mlu_global__ void MLUBlockDefaultGetIndicePairKernel2(void 
*index_output_ptr, int32_t num_act_out, int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 int32_t len_job = 0, offset_job = 0; assignTask(num_act_out, taskIdY, taskDimY, offset_job, len_job); int32_t repeat = (len_job + core_num_l - 1) / core_num_l; @@ -182,16 +177,15 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel2(void *index_output_ptr, int32_t length = i == (repeat - 1) ? rem_num_l : core_num_l; stepIndex((int32_t *)nram_input, start_index, length); // sync int32_t *output_ptr = (int32_t *)index_output_ptr + start_index; - __memcpy((char *)output_ptr, nram_input, length * sizeof(int), NRAM2GDRAM); + __memcpy((int8_t *)output_ptr, nram_input, length * sizeof(int), + NRAM2GDRAM); } -#endif } __mlu_global__ void MLUBlockBalanceGetIndicePairKernel( void *balance_input, void *balance_mask, void *balance_output, int32_t len_l, int32_t kernel_volume, int32_t core_num_l, int32_t output_size) { -#if __BANG_ARCH__ >= 370 int32_t len_job, offset_job = 0; assignTask(len_l * kernel_volume, taskIdY, taskDimY, offset_job, len_job); int32_t repeat = (len_job + core_num_l - 1) / core_num_l; @@ -214,9 +208,9 @@ __mlu_global__ void MLUBlockBalanceGetIndicePairKernel( (int32_t *)balance_mask + offset_job + i * core_num_l; int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; int32_t *nram_mask_t = (int32_t *)nram_mask + (i % 2) * ping_pong_num; - __memcpy_async((char *)nram_input_t, (char *)balance_input_ptr, + __memcpy_async((int8_t *)nram_input_t, (int8_t *)balance_input_ptr, deal_num * sizeof(int), GDRAM2NRAM); - __memcpy_async((char *)nram_mask_t, (char *)balance_mask_ptr, + __memcpy_async((int8_t *)nram_mask_t, (int8_t *)balance_mask_ptr, deal_num * sizeof(int), GDRAM2NRAM); } if (1 <= i && i <= repeat) { @@ -229,12 +223,12 @@ __mlu_global__ void MLUBlockBalanceGetIndicePairKernel( (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; __bang_mul_scalar((int32_t *)nram_aux, (int32_t *)nram_mask_t, int(-1), deal_num); - __bang_band((char 
*)nram_output_t, (char *)nram_input_t, (char *)nram_aux, - deal_num * sizeof(int)); + __bang_band((int8_t *)nram_output_t, (int8_t *)nram_input_t, + (int8_t *)nram_aux, deal_num * sizeof(int)); __bang_sub_scalar((int32_t *)nram_aux, (int32_t *)nram_mask_t, int(1), deal_num); - __bang_band((char *)nram_aux, (char *)nram_aux, (char *)nram_random_num, - deal_num * sizeof(int)); + __bang_band((int8_t *)nram_aux, (int8_t *)nram_aux, + (int8_t *)nram_random_num, deal_num * sizeof(int)); __bang_add((int32_t *)nram_output_t, (int32_t *)nram_output_t, (int32_t *)nram_aux, deal_num); } @@ -243,18 +237,17 @@ __mlu_global__ void MLUBlockBalanceGetIndicePairKernel( uint64_t gdram_offset = (offset_job + (i - 2) * core_num_l) * sizeof(int); int32_t *nram_output_t = (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)balance_output + gdram_offset, - (char *)nram_output_t, deal_num * sizeof(int), NRAM2GDRAM); + __memcpy_async((int8_t *)balance_output + gdram_offset, + (int8_t *)nram_output_t, deal_num * sizeof(int), + NRAM2GDRAM); } __sync(); } -#endif } __mlu_global__ void MLUBlockDefaultGetIndicePairKernel3( void *indice_pair, void *indice_index_ptr, void *mask_all, int32_t len_l, int32_t kernel_volume, int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 int32_t len_l_job = 0, offset_l_job = 0; assignTask(2 * kernel_volume, taskIdY, taskDimY, offset_l_job, len_l_job); float *nram_input = (float *)nbuf_total; @@ -290,18 +283,16 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel3( (int32_t *)indice_pair + store_offset * len_l + core_offset_l_valid; core_offset_l_valid += valid_l_num_now; if (valid_l_num_now > 0) { - __memcpy((char *)store_valid_ptr, (char *)nram_output, + __memcpy((int8_t *)store_valid_ptr, (int8_t *)nram_output, valid_l_num_now * sizeof(int32_t), NRAM2GDRAM); } } } -#endif } __mlu_global__ void MLUBlockDefaultGetIndicePairKernel4( void *indice_out, void *input_ptr, OutputSpace host_output_space, int32_t len_l, int32_t core_num_l) { 
-#if __BANG_ARCH__ >= 370 OutputSpace output_space = host_output_space; int32_t len_l_job = 0, offset_l_job = 0; assignTask(len_l, taskIdY, taskDimY, offset_l_job, len_l_job); @@ -319,7 +310,7 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel4( int32_t load_num_l = i == (repeat - 1) ? rem_num_l : core_num_l; int32_t *input_start_ptr = input_start_core + i * core_num_l; int32_t *nram_input_load = nram_input + (i % 2) * ping_pong_num; - __memcpy_async((char *)nram_input_load, (char *)input_start_ptr, + __memcpy_async((int8_t *)nram_input_load, (int8_t *)input_start_ptr, load_num_l * sizeof(int32_t), GDRAM2NRAM); } if (1 <= i && i < (repeat + 1)) { @@ -334,12 +325,11 @@ __mlu_global__ void MLUBlockDefaultGetIndicePairKernel4( int32_t *nram_output_t = nram_output + ((i - 2) % 2) * ping_pong_num; int32_t *indice_out_t = (int32_t *)indice_out + (offset_l_job + (i - 2) * core_num_l) * 4; - __memcpy_async((char *)indice_out_t, (char *)nram_output_t, + __memcpy_async((int8_t *)indice_out_t, (int8_t *)nram_output_t, load_num_l * 4 * sizeof(int32_t), NRAM2GDRAM); } __sync(); } -#endif } __mlu_global__ void MLUBlockSubmGetIndicePairKernel1( @@ -349,7 +339,6 @@ __mlu_global__ void MLUBlockSubmGetIndicePairKernel1( OutputSpace host_output_space, Stride host_stride, Dilation host_dilation, Padding host_padding, int32_t core_num_l, int32_t input_active_site, int32_t batch) { -#if __BANG_ARCH__ >= 370 /* nram_space |input| mask_all | indice_index_in | indice_out_expand | indice_in_expand | 4l + l + 3kl |input| mask_all | indice_index_in | indice_out_expand | @@ -385,7 +374,7 @@ __mlu_global__ void MLUBlockSubmGetIndicePairKernel1( (float *)indices_in + (offset_l_job + i * core_num_l) * Ndim; int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; int32_t deal_num = i == repeat - 1 ? 
rem_num_l : core_num_l; - __memcpy_async((char *)nram_input_t, (char *)indices_in_addr, + __memcpy_async((int8_t *)nram_input_t, (int8_t *)indices_in_addr, deal_num * Ndim * sizeof(int), GDRAM2NRAM); } if (1 <= i && i < (repeat + 1)) { @@ -407,34 +396,32 @@ __mlu_global__ void MLUBlockSubmGetIndicePairKernel1( (offset_l_job + (i - 2) * core_num_l) * sizeof(int32_t); int32_t *nram_output_t = (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)mask_all_ptr + gdram_offset, - (char *)(nram_output_t), deal_num * sizeof(int), + __memcpy_async((int8_t *)mask_all_ptr + gdram_offset, + (int8_t *)(nram_output_t), deal_num * sizeof(int), NRAM2GDRAM, input_active_site * sizeof(int), deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_index_in_ptr + gdram_offset, - (char *)(nram_output_t + deal_num * k_dhw), + __memcpy_async((int8_t *)indice_index_in_ptr + gdram_offset, + (int8_t *)(nram_output_t + deal_num * k_dhw), deal_num * sizeof(int), NRAM2GDRAM, input_active_site * sizeof(int), deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_out_expand_ptr + gdram_offset, - (char *)(nram_output_t + 2 * deal_num * k_dhw), + __memcpy_async((int8_t *)indice_out_expand_ptr + gdram_offset, + (int8_t *)(nram_output_t + 2 * deal_num * k_dhw), deal_num * sizeof(int), NRAM2GDRAM, input_active_site * sizeof(int), deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_in_expand_ptr + gdram_offset, - (char *)(nram_output_t + 3 * deal_num * k_dhw), + __memcpy_async((int8_t *)indice_in_expand_ptr + gdram_offset, + (int8_t *)(nram_output_t + 3 * deal_num * k_dhw), deal_num * sizeof(int), NRAM2GDRAM); } __sync(); } -#endif } __mlu_global__ void MLUBlockSubmGetIndicePairKernel2( void *indice_out, void *mask_all_ptr, void *indice_out_index_ptr, void *indices_in, int32_t len_1_one, int32_t len_l_two, int32_t core_num_1_one, int32_t core_num_l_two) { -#if __BANG_ARCH__ >= 370 int32_t len_job = 0, offset_job = 0; 
assignTask(len_1_one, taskIdY, taskDimY, offset_job, len_job); int32_t repeat = (len_job + core_num_1_one - 1) / core_num_1_one; @@ -447,9 +434,9 @@ __mlu_global__ void MLUBlockSubmGetIndicePairKernel2( for (int i = 0; i < repeat; ++i) { int32_t offset = i * core_num_1_one; int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_1_one; - __memcpy_async((char *)nram_input, (char *)(indices_in_offset + offset), + __memcpy_async((int8_t *)nram_input, (int8_t *)(indices_in_offset + offset), deal_num * bit_width, GDRAM2NRAM); - __memcpy_async((char *)(indice_out_offset + offset), (char *)nram_input, + __memcpy_async((int8_t *)(indice_out_offset + offset), (int8_t *)nram_input, deal_num * bit_width, NRAM2GDRAM); } @@ -464,19 +451,18 @@ __mlu_global__ void MLUBlockSubmGetIndicePairKernel2( for (int i = 0; i < repeat; ++i) { int32_t offset = i * core_num_l_two; int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l_two; - __memcpy((char *)nram_input, (char *)(mask_all_ptr_offset + offset), + __memcpy((int8_t *)nram_input, (int8_t *)(mask_all_ptr_offset + offset), deal_num * bit_width, GDRAM2NRAM); - __memcpy((char *)nram_output, - (char *)(indice_out_index_ptr_offset + offset), + __memcpy((int8_t *)nram_output, + (int8_t *)(indice_out_index_ptr_offset + offset), deal_num * bit_width, GDRAM2NRAM); __bang_ge_scalar((int32_t *)nram_output, (int32_t *)nram_output, (int)0, deal_num); __bang_and((int32_t *)nram_output, (int32_t *)nram_output, (int32_t *)nram_input, deal_num); - __memcpy((char *)(mask_all_ptr_offset + offset), (char *)nram_output, + __memcpy((int8_t *)(mask_all_ptr_offset + offset), (int8_t *)nram_output, deal_num * bit_width, NRAM2GDRAM); } -#endif } mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl1( diff --git a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h index 02438bd2c..52a135c7f 100644 --- a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h +++ 
b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h @@ -29,7 +29,6 @@ #include "kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.h" #include "kernels/kernel.h" -#if __BANG_ARCH__ >= 370 __mlu_func__ void assignTask(const int32_t num_total_task, const int32_t &taskid, const int32_t &taskdim, int32_t &task_offset, int32_t &num_cur_task) { @@ -245,13 +244,13 @@ __mlu_func__ void genIndiceOutExpand(int32_t *nram_output, int32_t *mask_all, int32_t *nram_input, int32_t *temp, int32_t deal_num, int32_t output_size) { __bang_mul_scalar((int32_t *)temp, (int32_t *)mask_all, int(-1), deal_num); - __bang_band((char *)nram_output, (char *)nram_input, (char *)temp, + __bang_band((int8_t *)nram_output, (int8_t *)nram_input, (int8_t *)temp, deal_num * sizeof(int32_t)); // clost to intmax __bang_sub_scalar((int32_t *)temp, (int32_t *)mask_all, int(1), deal_num); __bang_mul_scalar((int32_t *)temp, (int32_t *)temp, int(-1 * output_size), deal_num); - __bang_bor((char *)nram_output, (char *)nram_output, (char *)temp, + __bang_bor((int8_t *)nram_output, (int8_t *)nram_output, (int8_t *)temp, deal_num * sizeof(int32_t)); } @@ -344,5 +343,4 @@ __mlu_func__ void genIndiceInExpand(int32_t *nram_output, int32_t *nram_input, __bang_add((int32_t *)nram_output, (int32_t *)nram_output, (int32_t *)nram_aux + 4 * deal_num, deal_num); } -#endif #endif // KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ diff --git a/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp b/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp index ca2fecd21..fd77f94f6 100644 --- a/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp +++ b/kernels/sparse_conv/get_indice_pairs/normal_get_indice_pairs.cpp @@ -49,9 +49,8 @@ static mluOpStatus_t getIndiceIndexIn( const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, const int input_active_site, size_t *size) { size_t total_size = 0; - total_size = - kernel_volume * input_active_site * - 
mluop::getSizeOfDataType(indice_pairs_desc->dtype); + total_size = kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -60,9 +59,8 @@ static mluOpStatus_t getIndiceIndexOut( const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, const int input_active_site, size_t *size) { size_t total_size = 0; - total_size = - kernel_volume * input_active_site * - mluop::getSizeOfDataType(indice_pairs_desc->dtype); + total_size = kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -71,9 +69,8 @@ static mluOpStatus_t getIndiceOutExpand( const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, const int input_active_site, size_t *size) { size_t total_size = 0; - total_size = - kernel_volume * input_active_site * - mluop::getSizeOfDataType(indice_pairs_desc->dtype); + total_size = kernel_volume * input_active_site * + mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -82,8 +79,8 @@ static mluOpStatus_t getIndiceInExpand( const mluOpTensorDescriptor_t indice_pairs_desc, const int input_active_site, size_t *size) { size_t total_size = 0; - total_size = input_active_site * - mluop::getSizeOfDataType(indice_pairs_desc->dtype); + total_size = + input_active_site * mluop::getSizeOfDataType(indice_pairs_desc->dtype); size[0] = total_size; return MLUOP_STATUS_SUCCESS; } @@ -285,7 +282,7 @@ mluOpStatus_t launchDefaultKernel1( int core_num_l = (nram_size - 4 * 4096 * 3) / nums / sizeof(int); int jobs = (input_active_site + core_num_l - 1) / core_num_l; int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -351,7 +348,7 @@ mluOpStatus_t launchSubmKernel1( int least_jobs = (input_active_site * sizeof(int) + 1024 - 1) / 1024; jobs = std::max(jobs, least_jobs); int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -419,7 +416,7 @@ mluOpStatus_t launchSubmKernel2(mluOpHandle_t handle, const void *indices, int least_jobs = std::max(least_job_one, least_job_two); int jobs = std::max(std::max(jobs_one, jobs_two), least_jobs); int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -532,7 +529,7 @@ mluOpStatus_t launchUniqueOp(mluOpHandle_t handle, } cnrtQueueSync(handle->queue); cnrtMemcpy(return_num_act, unique_output_num_addr, sizeof(float), - CNRT_MEM_TRANS_DIR_DEV2HOST); + cnrtMemcpyDevToHost); CHECK_RETURN(interface_name, mluOpDestroyTensorDescriptor(unique_input_desc)); CHECK_RETURN(interface_name, mluOpDestroyTensorDescriptor(unique_output_desc)); @@ -557,7 +554,7 @@ mluOpStatus_t launchDefaultKernel2(mluOpHandle_t handle, int core_num_l = (nram_size - 4 * 4096 * 3) / sizeof(int); int jobs = (num_act_out + core_num_l - 1) / core_num_l; int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -595,7 +592,7 @@ mluOpStatus_t launchBalanceKernel(mluOpHandle_t handle, int core_num_l = (nram_size - 4 * 4096 * 3) / 8 / sizeof(int); int jobs = (input_active_site * kernel_volume + core_num_l - 1) / core_num_l; int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -770,7 +767,7 @@ mluOpStatus_t launchDefaultKernel3(mluOpHandle_t handle, void *output_addr, int core_num_l = (nram_size - 4 * 4096 * 3) / 4 / sizeof(int); int jobs = 2 * kernel_volume; int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -810,7 +807,7 @@ mluOpStatus_t launchDefaultKernel4( int core_num_l = (nram_size - 4 * 4096 * 3) / core_num_split / sizeof(int); int jobs = (num_act_out + core_num_l - 1) / core_num_l; int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; + func_type = cnrtFuncTypeBlock; kDim3.x = 1; kDim3.y = job_num; kDim3.z = 1; @@ -875,13 +872,13 @@ mluOpStatus_t NormalGetIndicePairsKernel( getReduceOpWS(handle, interface_name, kernel_volume, input_active_site, &reduce_op_ws)); const void *compute_indices_ptr = indices; - void *mask_all_ptr = (void *)((char *)workspace); - void *indice_index_in_ptr = (void *)((char *)workspace + mask_all_ws); + void *mask_all_ptr = (void *)((int8_t *)workspace); + void *indice_index_in_ptr = (void *)((int8_t *)workspace + mask_all_ws); void *indice_in_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws); void *out_indices_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + indice_in_expand_ws); CHECK_RETURN( interface_name, @@ -892,8 +889,8 @@ mluOpStatus_t NormalGetIndicePairsKernel( // call launchDefaultKernel2 gen step_index void *step_index_addr = NULL; - step_index_addr = - (void *)((char *)(char *)workspace + mask_all_ws + indice_index_in_ws); + step_index_addr = (void *)((int8_t *)(int8_t 
*)workspace + mask_all_ws + + indice_index_in_ws); CHECK_RETURN(interface_name, launchDefaultKernel2(handle, step_index_addr, input_active_site)); @@ -902,7 +899,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( *scatter_indice_addr = NULL; scatter_input_addr = step_index_addr; scatter_indice_addr = indice_in_expand_ptr; - scatter_output_addr = (void *)((char *)workspace + mask_all_ws + + scatter_output_addr = (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + indice_in_expand_ws + out_indices_expand_ws); int fill_value = -1; @@ -918,7 +915,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( void *gather_input_addr = NULL, *gather_output_addr = NULL, *gather_indice_addr = NULL; gather_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws); gather_input_addr = scatter_output_addr; gather_indice_addr = out_indices_expand_ptr; CHECK_RETURN( @@ -949,7 +946,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( void *reduce_workspace_ptr = NULL; if (reduce_op_ws > 0) { reduce_workspace_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + indice_in_expand_ws + out_indices_expand_ws); } @@ -1009,10 +1006,10 @@ mluOpStatus_t NormalGetIndicePairsKernel( getUniqueOpWS(handle, interface_name, indices_desc, kernel_volume, input_active_site, &unique_op_ws)); const void *compute_indices_ptr = indices; - void *mask_all_ptr = (void *)((char *)workspace); - void *indice_index_in_ptr = (void *)((char *)workspace + mask_all_ws); + void *mask_all_ptr = (void *)((int8_t *)workspace); + void *indice_index_in_ptr = (void *)((int8_t *)workspace + mask_all_ws); void *out_indices_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_out_ws + + (void *)((int8_t *)workspace + mask_all_ws + indice_index_out_ws + indice_index_in_ws); 
CHECK_RETURN(interface_name, launchDefaultKernel1( @@ -1026,7 +1023,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( reduce_output_addr = indice_num; void *reduce_workspace_ptr = NULL; if (reduce_op_ws > 0) { - reduce_workspace_ptr = (void *)((char *)workspace + mask_all_ws + + reduce_workspace_ptr = (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + out_indices_expand_ws + indice_unique_ws); } @@ -1042,13 +1039,13 @@ mluOpStatus_t NormalGetIndicePairsKernel( *unique_output_num_addr = NULL; unique_input_addr = out_indices_expand_ptr; unique_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + out_indices_expand_ws); unique_output_num_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws); void *unique_workspace_ptr = NULL; if (unique_op_ws > 0) { - unique_workspace_ptr = (void *)((char *)workspace + mask_all_ws + + unique_workspace_ptr = (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + out_indices_expand_ws + indice_unique_ws); } @@ -1079,7 +1076,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( // call launchDefaultKernel2 gen step_index void *step_index_addr = NULL; step_index_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws); CHECK_RETURN(interface_name, launchDefaultKernel2(handle, step_index_addr, num_act_out)); @@ -1100,7 +1097,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( *scatter_indice_addr = NULL; scatter_input_addr = step_index_addr; scatter_indice_addr = unique_output_addr; - scatter_output_addr = (void *)((char *)workspace + mask_all_ws + + scatter_output_addr = (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws + indice_index_out_ws + out_indices_expand_ws + indice_unique_ws); int 
fill_value = -1; @@ -1116,7 +1113,7 @@ mluOpStatus_t NormalGetIndicePairsKernel( void *gather_input_addr = NULL, *gather_output_addr = NULL, *gather_indice_addr = NULL; gather_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); + (void *)((int8_t *)workspace + mask_all_ws + indice_index_in_ws); gather_input_addr = scatter_output_addr; gather_indice_addr = out_indices_expand_ptr; CHECK_RETURN( diff --git a/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.cpp b/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.cpp index 024885a84..2f714d1b8 100644 --- a/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.cpp +++ b/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.cpp @@ -621,14 +621,14 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( getMaxNumInArray(indice_num, K) * output_grad_desc->dims[1] * cal_dwidth; input_grad_condence_size = getMaxNumInArray(indice_num, K) * input_grad_desc->dims[1] * cal_dwidth; - char *filter_transpose = (char *)filters; - char *workspace_base = (char *)workspace; + int8_t *filter_transpose = (int8_t *)filters; + int8_t *workspace_base = (int8_t *)workspace; // transpose filters to layout XHWCN mluOpTensorDescriptor_t filter_transpose_desc; if (filters_desc->layout != MLUOP_LAYOUT_HWCN && filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - filter_transpose = (char *)workspace; + filter_transpose = (int8_t *)workspace; workspace_base += filter_transpose_size; cnnlTransposeDescriptor_t trans_desc; CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&filter_transpose_desc)); @@ -655,7 +655,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } - char *transpose_workspace = workspace_base; + int8_t *transpose_workspace = workspace_base; workspace_base += 
transpose_workspace_size; { DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); @@ -674,9 +674,9 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( } else { filter_transpose_desc = filters_desc; } - char *output_grad_condence = workspace_base; + int8_t *output_grad_condence = workspace_base; workspace_base += output_grad_condence_size; - char *input_grad_condence = workspace_base; + int8_t *input_grad_condence = workspace_base; workspace_base += input_grad_condence_size; // filters calculate desc @@ -696,8 +696,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( DESTROY_CNNL_HANDLE(cnnl_handle); void *workspace_matmul = NULL; - char *workspace_input_grad_tmp = NULL; - char *workspace_addn = NULL; + int8_t *workspace_input_grad_tmp = NULL; + int8_t *workspace_addn = NULL; // filters DHW dim loop int kk_count = 0; @@ -707,7 +707,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( continue; } const int int_dwidth = 4; - char *sub_filter = filter_transpose + kk * dyc * dxc * cal_dwidth; + int8_t *sub_filter = filter_transpose + kk * dyc * dxc * cal_dwidth; // gather output_grad mluOpTensorDescriptor_t gather_indices_desc; @@ -726,8 +726,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( output_grad_condence_dims)); uint64_t gather_indices_offset = (kk * 2 + 1) * int(indice_pairs_desc->dims[2]) * int_dwidth; - char *gather_indices = - (char *)(const_cast(indice_pairs)) + gather_indices_offset; + int8_t *gather_indices = + (int8_t *)(const_cast(indice_pairs)) + gather_indices_offset; { DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_desc, @@ -849,8 +849,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( // scatter input_grad uint64_t scatter_indices_offset = (kk * 2) * int(indice_pairs_desc->dims[2]) * int_dwidth; - char *scatter_indices = - (char *)(const_cast(indice_pairs)) + scatter_indices_offset; + int8_t 
*scatter_indices = + (int8_t *)(const_cast(indice_pairs)) + scatter_indices_offset; { DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indices_desc, diff --git a/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.h b/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.h index e8be23270..01dd51e72 100644 --- a/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.h +++ b/kernels/sparse_conv/indice_convolution_backward_data/indice_convolution_backward_data.h @@ -33,4 +33,4 @@ inline int getMaxNumInArray(const int64_t arr[], const int num) { return max_num; } -#endif // KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT +#endif // KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT diff --git a/kernels/sparse_conv/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp b/kernels/sparse_conv/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp index 0dc6440ad..1a464735b 100644 --- a/kernels/sparse_conv/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp +++ b/kernels/sparse_conv/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp @@ -346,9 +346,9 @@ static mluOpStatus_t internalIndiceConvBackwardFilter( filters_grad_need_trans ? filters_grad_desc->total_tensor_size : 0; void *filters_grad_temp = filters_grad_need_trans ? 
workspace : filters_grad; - void *input_temp = (char *)workspace + filters_grad_trans_size; - void *diffy_temp = (char *)input_temp + max_input_size; - void *matmul_ws = (char *)diffy_temp + max_diffy_size; + void *input_temp = (int8_t *)workspace + filters_grad_trans_size; + void *diffy_temp = (int8_t *)input_temp + max_input_size; + void *matmul_ws = (int8_t *)diffy_temp + max_diffy_size; // create temp tensor for gather and matmul mluOpTensorDescriptor_t active_indice_desc; @@ -433,10 +433,11 @@ static mluOpStatus_t internalIndiceConvBackwardFilter( matmul_ws_size = temp_matmul_size > matmul_ws_size ? temp_matmul_size : matmul_ws_size; } else { - void *filters_grad_buffer = (char *)filters_grad_temp + i * cico_size; - void *gather_input_indice = (char *)indice_pairs + i * 2 * pair_low_size; + void *filters_grad_buffer = (int8_t *)filters_grad_temp + i * cico_size; + void *gather_input_indice = + (int8_t *)indice_pairs + i * 2 * pair_low_size; void *gather_output_grad = - (char *)indice_pairs + i * 2 * pair_low_size + pair_low_size; + (int8_t *)indice_pairs + i * 2 * pair_low_size + pair_low_size; // gather activate input data [n, ci] { DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); diff --git a/kernels/sparse_conv/indice_convolution_forward/indice_convolution_forward.cpp b/kernels/sparse_conv/indice_convolution_forward/indice_convolution_forward.cpp index e2d3c5b89..5022875a5 100644 --- a/kernels/sparse_conv/indice_convolution_forward/indice_convolution_forward.cpp +++ b/kernels/sparse_conv/indice_convolution_forward/indice_convolution_forward.cpp @@ -229,12 +229,12 @@ static mluOpStatus_t mainIndiceConvolutionForward( // allocate workspace segment for intermediate data void *validFilters_ptr = filters_need_trans ? 
workspace : (void *)filters; - void *transposeExtra_ptr = (char *)workspace + workspaceSize_transpose; - void *matmulResult_ptr = (char *)workspace + workspaceSize_transpose; - void *gatherResult_ptr = (char *)matmulResult_ptr + workspaceSize_matmul; - void *matmulExtra_ptr = (char *)gatherResult_ptr + workspaceSize_gather; - void *scatterResult_ptr = (char *)matmulResult_ptr + workspaceSize_matmul; - void *addNExtra_ptr = (char *)scatterResult_ptr + workspaceSize_scatter; + void *transposeExtra_ptr = (int8_t *)workspace + workspaceSize_transpose; + void *matmulResult_ptr = (int8_t *)workspace + workspaceSize_transpose; + void *gatherResult_ptr = (int8_t *)matmulResult_ptr + workspaceSize_matmul; + void *matmulExtra_ptr = (int8_t *)gatherResult_ptr + workspaceSize_gather; + void *scatterResult_ptr = (int8_t *)matmulResult_ptr + workspaceSize_matmul; + void *addNExtra_ptr = (int8_t *)scatterResult_ptr + workspaceSize_scatter; void *addN_ptrs[2] = {scatterResult_ptr, features_out}; // create intermediate tensor @@ -407,11 +407,12 @@ static mluOpStatus_t mainIndiceConvolutionForward( ? 
tempSize_addNExtra : workspaceSize_addNExtra; } else { - void *filters_buffer = (char *)validFilters_ptr + i * elementSize_filters; + void *filters_buffer = + (int8_t *)validFilters_ptr + i * elementSize_filters; void *gatherIndice_buffer = - (char *)indice_pairs + i * 2 * elementSize_indice_pairs; + (int8_t *)indice_pairs + i * 2 * elementSize_indice_pairs; void *scatterAddIndice_buffer = - (char *)indice_pairs + (i * 2 + 1) * elementSize_indice_pairs; + (int8_t *)indice_pairs + (i * 2 + 1) * elementSize_indice_pairs; // invoke gather to get input data: // [num_act_in, ci] -> [indice_pairs_num[i], ci] { diff --git a/kernels/sqrt/sqrt_union1.mlu b/kernels/sqrt/sqrt_union1.mlu index c814c262a..0076a162f 100644 --- a/kernels/sqrt/sqrt_union1.mlu +++ b/kernels/sqrt/sqrt_union1.mlu @@ -33,9 +33,9 @@ #define SQRT_RECOVER 1e3 __nram__ float nram_tmp[NFU_ALIGN_SIZE]; -__nram__ char nram_buffer[BINARY_NRAM_SIZE]; +__nram__ int8_t nram_buffer[BINARY_NRAM_SIZE]; #if __BANG_ARCH__ != 520 -__mlu_shared__ char sram_buffer[BINARY_NRAM_SIZE]; +__mlu_shared__ int8_t sram_buffer[BINARY_NRAM_SIZE]; #endif template @@ -88,16 +88,16 @@ __mlu_func__ void funcSqrtFast(bfloat16_t *nram_output, bfloat16_t *nram_input, } template -__mlu_func__ void computeSqrtFast(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeSqrtFast(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num) { funcSqrtFast((T2 *)nram_output, (T1 *)nram_input, (T2 *)auxiliary_a, (T2 *)auxiliary_b, actual_num, deal_num); } template -__mlu_func__ void computeSqrtHighAcc(char *nram_output, char *nram_input, - char *auxiliary_a, char *auxiliary_b, +__mlu_func__ void computeSqrtHighAcc(int8_t *nram_output, int8_t *nram_input, + int8_t *auxiliary_a, int8_t *auxiliary_b, size_t deal_num, size_t actual_num) { __bang_half2float((float *)nram_output, (half *)nram_input, deal_num); __bang_sqrt((float *)nram_output, 
(float *)nram_output, actual_num); @@ -192,8 +192,9 @@ __mlu_func__ void auxFunc5SqrtBackwardFast( template __mlu_func__ void computeSqrtBackwardHighAcc( - char *nram_output, char *nram_input1, char *nram_input2, char *auxiliary_a, - char *auxiliary_b, char *auxiliary_c, size_t deal_num, size_t actual_num) { + int8_t *nram_output, int8_t *nram_input1, int8_t *nram_input2, + int8_t *auxiliary_a, int8_t *auxiliary_b, int8_t *auxiliary_c, + size_t deal_num, size_t actual_num) { #if __BANG_ARCH__ != 520 // TODO(sram): tp_520 float *nram_fp_y = (float *)((DType_in1 *)nram_input1 - deal_num); float *nram_fp_dy = (float *)((DType_in1 *)nram_input2 - deal_num); @@ -216,10 +217,10 @@ __mlu_func__ void computeSqrtBackwardHighAcc( */ template -__mlu_func__ void computeSqrtBackwardFast(char *nram_output, char *nram_input1, - char *nram_input2, char *auxiliary_a, - char *auxiliary_b, char *auxiliary_c, - size_t deal_num, size_t actual_num) { +__mlu_func__ void computeSqrtBackwardFast( + int8_t *nram_output, int8_t *nram_input1, int8_t *nram_input2, + int8_t *auxiliary_a, int8_t *auxiliary_b, int8_t *auxiliary_c, + size_t deal_num, size_t actual_num) { #if __BANG_ARCH__ != 520 // TODO(sram): tp_520 #if __BANG_ARCH__ >= 300 __bang_mul_scalar((DType_in1 *)nram_input2, (DType_in1 *)nram_input2, @@ -252,17 +253,17 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineSqrt( // launch kernel if (d_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { KERNEL_CHECK(MLUBlockKernel3StagePipelineSqrtFast - <<>>((char *)x, (char *)y, num)); + <<>>((int8_t *)x, (int8_t *)y, num)); } else if (d_type == mluOpDataType_t::MLUOP_DTYPE_INT32) { KERNEL_CHECK(MLUBlockKernel3StagePipelineSqrtFast - <<>>((char *)x, (char *)y, num)); + <<>>((int8_t *)x, (int8_t *)y, num)); } else if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) { KERNEL_CHECK(MLUBlockKernel3StagePipelineSqrtHighAcc - <<>>((char *)x, (char *)y, num)); + <<>>((int8_t *)x, (int8_t *)y, num)); } else { // bfloat16 
KERNEL_CHECK(MLUBlockKernel3StagePipelineSqrtFast - <<>>((char *)x, (char *)y, num)); + <<>>((int8_t *)x, (int8_t *)y, num)); } return MLUOP_STATUS_SUCCESS; } @@ -275,12 +276,14 @@ mluOpStatus_t MLUOP_WIN_API Kernel3StagePipelineSqrtBackward( if (d_type == mluOpDataType_t::MLUOP_DTYPE_HALF) { KERNEL_CHECK( MLUBlockKernel3StagePipelineSqrtBackwardHighAcc - <<>>((char *)y, (char *)diff_y, (char *)x, num)); + <<>>((int8_t *)y, (int8_t *)diff_y, (int8_t *)x, + num)); } else { // half KERNEL_CHECK( MLUBlockKernel3StagePipelineSqrtBackwardFast - <<>>((char *)y, (char *)diff_y, (char *)x, num)); + <<>>((int8_t *)y, (int8_t *)diff_y, (int8_t *)x, + num)); } return MLUOP_STATUS_SUCCESS; } diff --git a/kernels/tensor_stride_process/tensor_stride_in_block.mlu b/kernels/tensor_stride_process/tensor_stride_in_block.mlu index d2ca5560c..2bb868993 100644 --- a/kernels/tensor_stride_process/tensor_stride_in_block.mlu +++ b/kernels/tensor_stride_process/tensor_stride_in_block.mlu @@ -25,7 +25,7 @@ #include "mlu.h" #define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024) -__nram__ char ram[SIZE_NRAM_BUF]; +__nram__ int8_t ram[SIZE_NRAM_BUF]; template __mlu_func__ void blockTensorStridedIn(T *input, @@ -72,7 +72,7 @@ __mlu_global__ void MLUUnionKernelTensorStrideIn(const void *input, PERF_TIME_END(); } -template __mlu_global__ void MLUUnionKernelTensorStrideIn( +template __mlu_global__ void MLUUnionKernelTensorStrideIn( const void *input, mluop::TensorShape input_shape, void *output); template __mlu_global__ void MLUUnionKernelTensorStrideIn( const void *input, mluop::TensorShape input_shape, void *output); diff --git a/kernels/tensor_stride_process/tensor_stride_out_block.mlu b/kernels/tensor_stride_process/tensor_stride_out_block.mlu index cdb87f972..c099f34f2 100644 --- a/kernels/tensor_stride_process/tensor_stride_out_block.mlu +++ b/kernels/tensor_stride_process/tensor_stride_out_block.mlu @@ -25,7 +25,7 @@ #include "mlu.h" #define SIZE_NRAM_BUF (MAX_NRAM_SIZE + 
REM_FOR_STACK - 12 * 1024) -__nram__ char ram[SIZE_NRAM_BUF]; +__nram__ int8_t ram[SIZE_NRAM_BUF]; template __mlu_func__ void blockTensorStridedOut(T *input, T *output, @@ -66,7 +66,7 @@ __mlu_global__ void MLUUnionKernelTensorStrideOut( PERF_TIME_END(); } -template __mlu_global__ void MLUUnionKernelTensorStrideOut( +template __mlu_global__ void MLUUnionKernelTensorStrideOut( const void *input, void *output, mluop::TensorShape output_shape); template __mlu_global__ void MLUUnionKernelTensorStrideOut( const void *input, void *output, mluop::TensorShape output_shape); diff --git a/kernels/tensor_stride_process/tensor_stride_process_host.cpp b/kernels/tensor_stride_process/tensor_stride_process_host.cpp index 0099a111a..410112258 100644 --- a/kernels/tensor_stride_process/tensor_stride_process_host.cpp +++ b/kernels/tensor_stride_process/tensor_stride_process_host.cpp @@ -386,9 +386,9 @@ static mluOpStatus_t policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, uint64_t total_num) { if (handle->sram_size <= 0) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; } else { - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } uint32_t union_number = mluop::runtime::getClusterLimitCapability(handle); diff --git a/kernels/tensor_stride_process/tensor_stride_process_mlu.mlu b/kernels/tensor_stride_process/tensor_stride_process_mlu.mlu index 2bc9983fc..f47281507 100644 --- a/kernels/tensor_stride_process/tensor_stride_process_mlu.mlu +++ b/kernels/tensor_stride_process/tensor_stride_process_mlu.mlu @@ -42,7 +42,7 @@ mluOpStatus_t MLUOP_WIN_API KernelTensorStrideIn( case MLUOP_DTYPE_INT8: case MLUOP_DTYPE_BOOL: case MLUOP_DTYPE_UINT8: { - KERNEL_CHECK(MLUUnionKernelTensorStrideIn + KERNEL_CHECK(MLUUnionKernelTensorStrideIn <<>>(input, input_shape, output)); } break; @@ -72,7 +72,7 @@ KernelTensorStrideOut(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, case MLUOP_DTYPE_INT8: case MLUOP_DTYPE_BOOL: case MLUOP_DTYPE_UINT8: { 
- KERNEL_CHECK(MLUUnionKernelTensorStrideOut + KERNEL_CHECK(MLUUnionKernelTensorStrideOut <<>>(input, output, output_shape)); } break; diff --git a/kernels/three_interpolate/three_interpolate.cpp b/kernels/three_interpolate/three_interpolate.cpp index d780c9d15..0b0e3c142 100644 --- a/kernels/three_interpolate/three_interpolate.cpp +++ b/kernels/three_interpolate/three_interpolate.cpp @@ -250,7 +250,7 @@ static void policyFuncThreeInterpolate( use_cluster = (b * m_aligned_limit / m_limit + core_in_cluster - 1) / core_in_cluster; } - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_in_cluster; k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster; k_dim->z = 1; diff --git a/kernels/three_interpolate/three_interpolate_union1.mlu b/kernels/three_interpolate/three_interpolate_union1.mlu index 66d825d3b..1d0ba2e0e 100644 --- a/kernels/three_interpolate/three_interpolate_union1.mlu +++ b/kernels/three_interpolate/three_interpolate_union1.mlu @@ -27,7 +27,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define MIN(x, y) ((x) < (y) ? 
(x) : (y)) #define BATCH_LIMIT 1 #define INDEX_WEIGHT_LAST_DIM 3 @@ -43,8 +43,8 @@ __mlu_func__ void memcpy2D(T *dst, const T *src, uint32_t data_size, mluMemcpyDirection_t dir, uint64_t dst_stride, uint64_t src_stride, uint32_t segnum) { for (uint32_t loopi = 0; loopi <= segnum; ++loopi) { - char *dst_addr = (char *)dst + loopi * dst_stride * sizeof(T); - char *src_addr = (char *)src + loopi * src_stride * sizeof(T); + int8_t *dst_addr = (int8_t *)dst + loopi * dst_stride * sizeof(T); + int8_t *src_addr = (int8_t *)src + loopi * src_stride * sizeof(T); __memcpy(dst_addr, src_addr, data_size * sizeof(T), dir); } } @@ -58,7 +58,6 @@ __mlu_func__ void selectIndicesBetweenMinAndMaxWithoutLimit( const uint32_t m_min, const uint32_t m_max, const uint32_t index, const uint32_t n_limit, const uint32_t c_limit, const uint32_t m_limit_org) { -#if __BANG_ARCH__ >= 370 && __BANG_ARCH__ != 520 // select the offset between the m_min and m_max // judge if less than m_max __bang_ge_scalar((int32_t *)nram_indices_transpose_float_addition, @@ -72,7 +71,6 @@ __mlu_func__ void selectIndicesBetweenMinAndMaxWithoutLimit( __bang_and((int32_t *)nram_indices_transpose_addition, (int32_t *)nram_indices_transpose_float_addition, (int32_t *)nram_indices_transpose_addition, n_limit); -#if __BANG_ARCH__ >= 322 // extra process for the nan/inf // set weights to be 0 for the indices not in range of [m_min, m_max) if (sizeof(T) == sizeof(float)) { @@ -80,9 +78,9 @@ __mlu_func__ void selectIndicesBetweenMinAndMaxWithoutLimit( __bang_mul_scalar((int32_t *)nram_mask_int32, (int32_t *)nram_indices_transpose_addition, (int32_t)INT32_MAX_MASK, n_limit); - __bang_band((char *)(nram_weights_transpose + index * n_limit), - (char *)(nram_weights_transpose + index * n_limit), - (char *)nram_mask_int32, INT32_MASK_REPEAT_TIMES * n_limit); + __bang_band((int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)nram_mask_int32, 
INT32_MASK_REPEAT_TIMES * n_limit); } else if (sizeof(T) == sizeof(half)) { int16_t *nram_mask_int16 = (int16_t *)nram_indices_transpose_float_addition; __bang_int322int16(nram_mask_int16, @@ -90,11 +88,10 @@ __mlu_func__ void selectIndicesBetweenMinAndMaxWithoutLimit( 0); __bang_mul_scalar((int16_t *)nram_mask_int16, (int16_t *)nram_mask_int16, (int16_t)INT16_MAX_MASK, n_limit); - __bang_band((char *)(nram_weights_transpose + index * n_limit), - (char *)(nram_weights_transpose + index * n_limit), - (char *)nram_mask_int16, INT16_MASK_REPEAT_TIMES * n_limit); + __bang_band((int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)nram_mask_int16, INT16_MASK_REPEAT_TIMES * n_limit); } -#endif // multiply the indices with values in the range of [m_min, m_max) __bang_mul((int32_t *)nram_indices_transpose_float, nram_indices_transpose + index * n_limit, @@ -119,7 +116,6 @@ __mlu_func__ void selectIndicesBetweenMinAndMaxWithoutLimit( // get the beginning offset by multiply c_limit __bang_mul_scalar(nram_indices, (int32_t *)nram_indices_transpose_float, c_limit, n_limit); -#endif } template @@ -130,11 +126,8 @@ __mlu_func__ void selectIndicesBetweenMinAndMax( const uint32_t m_min, const uint32_t m_max, const uint32_t index, const uint32_t n_limit, const uint32_t c_limit, const uint32_t m_limit_org) { - // convert indices from int32_t to float for 270 - __mluop_int322float(nram_indices_transpose_float, - nram_indices_transpose_float_addition, - nram_indices_transpose + index * n_limit, - nram_indices_transpose_addition, n_limit); + __bang_int322float(nram_indices_transpose_float, + nram_indices_transpose + index * n_limit, n_limit, 0); // select the offset between the m_min and m_max // judge if less than m_max __bang_ge_scalar(nram_indices_transpose_float_addition, @@ -148,7 +141,6 @@ __mlu_func__ void selectIndicesBetweenMinAndMax( __bang_and(nram_indices_transpose_addition, 
nram_indices_transpose_float_addition, nram_indices_transpose_addition, n_limit); -#if __BANG_ARCH__ >= 322 // extra process for the nan/inf // set weights to be 0 for the indices not in range of [m_min, m_max) if (sizeof(T) == sizeof(float)) { @@ -157,20 +149,19 @@ __mlu_func__ void selectIndicesBetweenMinAndMax( n_limit, 0); __bang_mul_scalar((int32_t *)nram_mask_int32, (int32_t *)nram_mask_int32, (int32_t)INT32_MAX_MASK, n_limit); - __bang_band((char *)(nram_weights_transpose + index * n_limit), - (char *)(nram_weights_transpose + index * n_limit), - (char *)nram_mask_int32, INT32_MASK_REPEAT_TIMES * n_limit); + __bang_band((int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)nram_mask_int32, INT32_MASK_REPEAT_TIMES * n_limit); } else if (sizeof(T) == sizeof(half)) { int16_t *nram_mask_int16 = (int16_t *)nram_indices_transpose_float_addition; __bang_float2int16_rd(nram_mask_int16, nram_indices_transpose_addition, n_limit, 0); __bang_mul_scalar((int16_t *)nram_mask_int16, (int16_t *)nram_mask_int16, (int16_t)INT16_MAX_MASK, n_limit); - __bang_band((char *)(nram_weights_transpose + index * n_limit), - (char *)(nram_weights_transpose + index * n_limit), - (char *)nram_mask_int16, INT16_MASK_REPEAT_TIMES * n_limit); + __bang_band((int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)(nram_weights_transpose + index * n_limit), + (int8_t *)nram_mask_int16, INT16_MASK_REPEAT_TIMES * n_limit); } -#endif // multiply the indices with values in the range of [m_min, m_max) __bang_mul(nram_indices_transpose_float, nram_indices_transpose_float, nram_indices_transpose_addition, n_limit); @@ -193,10 +184,7 @@ __mlu_func__ void selectIndicesBetweenMinAndMax( // get the beginning offset by multiply c_limit __bang_mul_scalar(nram_indices_transpose_float, nram_indices_transpose_float, c_limit, n_limit); - // convert the indices from float type back to int for 270 - __mluop_float2int32(nram_indices, 
nram_indices_transpose_addition, - nram_indices_transpose_float, - nram_indices_transpose_float_addition, n_limit); + __bang_float2int32(nram_indices, nram_indices_transpose_float, n_limit, 0); } template @@ -327,12 +315,10 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( sizeof(T), NRAM2NRAM, sizeof(T), INDEX_WEIGHT_LAST_DIM * sizeof(T), actual_n_size - 1); } -#if __BANG_ARCH__ >= 322 // extra process for the nan/inf // backup the weights after transpose __memcpy(nram_weights, nram_weights_transpose, weights_deal_size * sizeof(T), NRAM2NRAM); -#endif uint32_t c_rem = c; for (uint32_t j = 0; j < c_repeated_times; ++j) { @@ -384,13 +370,6 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( __bang_write_zero(nram_features_selected, output_deal_size); // 2.2 select the offset between the m_min and m_max // convert indices from int32_t to float -#if __BANG_ARCH__ < 370 || __BANG_ARCH__ == 520 - selectIndicesBetweenMinAndMax( - nram_indices, nram_indices_transpose, - nram_indices_transpose_addition, nram_indices_transpose_float, - nram_indices_transpose_float_addition, nram_weights_transpose, - m_min, m_max, index, n_limit, c_limit, m_limit_org); -#else if (m <= INT2FLOAT_KEEP_PRECISION_MAX_VALUE) { // float compute force is bigger than int selectIndicesBetweenMinAndMax( @@ -407,7 +386,6 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( nram_weights_transpose, m_min, m_max, index, n_limit, c_limit, m_limit_org); } -#endif // select the features from m*c to n*c // 2.3 select the Mo*Co according to the indices for (uint32_t s = 0; s < actual_n_size; ++s) { @@ -427,13 +405,11 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( // 2.6 add the different index's results __bang_add(nram_output, nram_features_selected, nram_output, c_limit * n_limit); -#if __BANG_ARCH__ >= 322 // extra process for the nan/inf // restore the nram_weights_transpose from nram_weights __memcpy(nram_weights_transpose + index * n_limit, nram_weights + index * n_limit, n_limit 
* sizeof(T), NRAM2NRAM); -#endif } // index c_limit = c_limit_org; m_limit = m_limit_org; @@ -612,12 +588,10 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( nram_weights + index, sizeof(T), NRAM2NRAM, sizeof(T), INDEX_WEIGHT_LAST_DIM * sizeof(T), n_slice - 1); } -#if __BANG_ARCH__ >= 322 // extra process for the nan/inf // backup the weights after transpose __memcpy(nram_weights, nram_weights_transpose, weights_deal_size * sizeof(T), NRAM2NRAM); -#endif // initial nram_grad_output_transpose with zero // and set extra c_limit size that will be selected by the index not in // [m_min, m_max) @@ -628,13 +602,6 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { // select the offset between the m_min and m_max // convert indices from int32_t to float -#if __BANG_ARCH__ < 370 || __BANG_ARCH__ == 520 - selectIndicesBetweenMinAndMax( - nram_indices, nram_indices_transpose, - nram_indices_transpose_addition, nram_indices_transpose_float, - nram_indices_transpose_float_addition, nram_weights_transpose, - m_min, m_max, index, n_limit, c_limit, m_limit_org); -#else if (m <= INT2FLOAT_KEEP_PRECISION_MAX_VALUE) { // float compute force is bigger than int selectIndicesBetweenMinAndMax( @@ -651,7 +618,6 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( nram_weights_transpose, m_min, m_max, index, n_limit, c_limit, m_limit_org); } -#endif // mul the grad_output and weights __bang_cycle_mul(nram_grad_features_transpose, nram_grad_output, nram_weights_transpose + index * n_limit, @@ -666,13 +632,11 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( nram_grad_features + selected_index, nram_grad_output_transpose + s * c_limit, c_limit); } -#if __BANG_ARCH__ >= 322 // extra process for the nan/inf // restore the nram_weights_transpose from nram_weights __memcpy(nram_weights_transpose + index * n_limit, nram_weights + index * n_limit, n_limit * sizeof(T), NRAM2NRAM); -#endif } // index 
c_limit = c_limit_org; n_limit = n_limit_org; diff --git a/kernels/three_nn_forward/three_nn_forward.cpp b/kernels/three_nn_forward/three_nn_forward.cpp index ed2fb0003..c74c64d71 100644 --- a/kernels/three_nn_forward/three_nn_forward.cpp +++ b/kernels/three_nn_forward/three_nn_forward.cpp @@ -224,14 +224,14 @@ mluOpStatus_t MLUOP_WIN_API mluOpThreeNNForward( mluOpDataType_t input_dtype = unknown_desc->dtype; void *known_workspace = workspace; void *transpose_workspace = - (char *)known_workspace + known_desc->total_tensor_size; + (int8_t *)known_workspace + known_desc->total_tensor_size; // start U1 task, occupy all available clusters cnrtDim3_t k_dims; k_dims.x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dims.y = mluop::runtime::getClusterLimitCapability(handle); k_dims.z = 1; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; VLOG(5) << "[mluOpThreeNNForward] cnnlTranspose_v2 feature start."; diff --git a/kernels/three_nn_forward/three_nn_forward_union1.mlu b/kernels/three_nn_forward/three_nn_forward_union1.mlu index 288be2906..0e1f95511 100644 --- a/kernels/three_nn_forward/three_nn_forward_union1.mlu +++ b/kernels/three_nn_forward/three_nn_forward_union1.mlu @@ -29,12 +29,11 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; -#if __BANG_ARCH__ >= 322 /** * returns the index of ret, which is stored at the 1st position of the `ret`, - * used after bang_min + * used after bang_argmin */ __mlu_func__ uint32_t getIndice(half *ret) { uint32_t indice = *((uint32_t *)((uint16_t *)ret + 1)); @@ -43,23 +42,21 @@ __mlu_func__ uint32_t getIndice(half *ret) { /** * returns the index of ret, which is stored at the 1st position of the `ret`, - * used after bang_min + * used after bang_argmin */ __mlu_func__ uint32_t getIndice(float *ret) { uint32_t indice = ((uint32_t *)ret)[1]; return indice; } -#endif 
template __mlu_func__ void auxArgmin(T *nram_dst, T *nram_src, const int num_deal, T *value, int *index) { - __bang_min(nram_dst, nram_src, num_deal); + __bang_argmin(nram_dst, nram_src, num_deal); *value = nram_dst[0]; __bang_write_value(nram_dst, num_deal, *value); __bang_eq(nram_dst, nram_src, nram_dst, num_deal); - __bang_findfirst1((uint32_t *)nram_dst, nram_dst, num_deal); - *index = *((int *)nram_dst); + *index = (uint32_t)__bang_findfirst1(nram_dst, num_deal); } template @@ -71,21 +68,15 @@ __mlu_func__ void auxFuncFind3Min(T *nram_aux_a, const int auxa_offset, __bang_write_value(nram_aux_sort_b, auxb_offset, (int)0); int index = 0; for (int i = 0; i < 3; i++) { -#if __BANG_ARCH__ >= 322 __bang_argmin(nram_dest, nram_aux_a, auxa_offset); nram_aux_sort_a[i] = nram_dest[0]; index = getIndice(nram_dest); -#else - T value = 0; - auxArgmin(nram_dest, nram_aux_a, auxa_offset, &value, &index); - nram_aux_sort_a[i] = value; -#endif nram_aux_sort_b[i] = nram_aux_b[index]; __memset_nram(nram_aux_a + index, 1, (T)(INFINITY)); } - __memcpy((char *)nram_aux_a, (char *)nram_aux_sort_a, auxa_offset * sizeof(T), - NRAM2NRAM); - __memcpy((char *)nram_aux_b, (char *)nram_aux_sort_b, + __memcpy((int8_t *)nram_aux_a, (int8_t *)nram_aux_sort_a, + auxa_offset * sizeof(T), NRAM2NRAM); + __memcpy((int8_t *)nram_aux_b, (int8_t *)nram_aux_sort_b, auxb_offset * sizeof(int), NRAM2NRAM); } @@ -127,19 +118,11 @@ __mlu_func__ void auxFuncNN( *align_num = NFU_ALIGN_SIZE / sizeof(T); *auxa_offset = NFU_ALIGN_SIZE / sizeof(T); *auxb_offset = 2 * NFU_ALIGN_SIZE / sizeof(int); -#if __BANG_ARCH__ >= 322 *known_num_deal = PAD_DOWN( (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 / (7 * sizeof(T)), *align_num); *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 * 4 / (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE), *align_num); -#else - *known_num_deal = PAD_DOWN( - (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 / (7 * sizeof(T)), *align_num); - *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * 
NFU_ALIGN_SIZE) / 3 * 2 / - (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE), - *align_num); -#endif *output_aux_sort_a_gap = 0; *output_aux_sort_b_gap = *output_aux_sort_a_gap + NFU_ALIGN_SIZE; @@ -152,7 +135,6 @@ __mlu_func__ void auxFuncNN( *auxillary_b_gap = *auxillary_a_gap + *unknown_num_deal * NFU_ALIGN_SIZE; } -#if __BANG_ARCH__ >= 322 template __mlu_func__ bool containNanInf(T *nram_unknown) { if (std::isnan(nram_unknown[0]) || std::isnan(nram_unknown[1]) || @@ -162,7 +144,6 @@ __mlu_func__ bool containNanInf(T *nram_unknown) { else return false; } -#endif template __mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist, @@ -173,9 +154,7 @@ __mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist, const int known_count, const int known_count_align) { __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY)); -#if __BANG_ARCH__ >= 322 if (!containNanInf(nram_unknown)) { -#endif // x1 - x2 __bang_sub_scalar(nram_dist, nram_known, nram_unknown[0], known_count_align); @@ -192,21 +171,13 @@ __mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist, known_count_align); __bang_add(nram_dist, nram_dist, nram_dist + 2 * known_count_align, known_count_align); -#if __BANG_ARCH__ >= 322 } -#endif int index = 0; for (int i = 0; i < 3; i++) { -#if __BANG_ARCH__ >= 322 __bang_argmin(nram_dest, nram_dist, known_count_align); nram_aux_a[i + deal_offset] = nram_dest[0]; index = getIndice(nram_dest); -#else - T value = 0; - auxArgmin(nram_dest, nram_dist, known_count_align, &value, &index); - nram_aux_a[i + deal_offset] = value; -#endif nram_aux_b[i + deal_offset] = index + known_seg_num * known_num_deal; __memset_nram(nram_dist + index, 1, (T)(INFINITY)); } @@ -218,19 +189,12 @@ __mlu_func__ void loadTransposedKnownTensor( const int batch_id, const int m, const int known_seg_num, const int count, const int count_align_num) { __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY)); -#if __BANG_ARCH__ >= 322 
__bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY)); __memcpy(nram_dist, known_gdram + (batch_id * m * 3 + known_seg_num * known_num_deal), count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T), m * sizeof(T), 2); __bang_minequal(nram_known, nram_known, nram_dist, 3 * count_align_num); -#else - __memcpy(nram_known, - known_gdram + (batch_id * m * 3 + known_seg_num * known_num_deal), - count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T), - m * sizeof(T), 2); -#endif } template diff --git a/kernels/tin_shift/tin_shift.cpp b/kernels/tin_shift/tin_shift.cpp index 25ea0b739..215d98186 100644 --- a/kernels/tin_shift/tin_shift.cpp +++ b/kernels/tin_shift/tin_shift.cpp @@ -71,7 +71,7 @@ static void policyFunc(const mluOpHandle_t handle, k_dim->x = core_limit; k_dim->y = (task_dim / core_limit) > 0 ? (task_dim / core_limit) : 1; k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } static mluOpStatus_t TinShiftPreCheck( diff --git a/kernels/tin_shift/tin_shift_union1.mlu b/kernels/tin_shift/tin_shift_union1.mlu index 89c031418..95dc2216a 100644 --- a/kernels/tin_shift/tin_shift_union1.mlu +++ b/kernels/tin_shift/tin_shift_union1.mlu @@ -25,7 +25,7 @@ #include "core/logging.h" #include "kernels/kernel.h" -__nram__ char data_nram[MAX_NRAM_SIZE]; +__nram__ int8_t data_nram[MAX_NRAM_SIZE]; template __mlu_func__ void mluMultiKernelTinShift( @@ -116,7 +116,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence( (cur_segment_index / loop_time % channel_size) * hw_size + cur_segment_index % loop_time * segmentime_size * hw_size * channel_size; - char *dst_gdram2nram = data_nram; + int8_t *dst_gdram2nram = data_nram; const T *src_gdram2nram = input + index; int count_gdram2nram = -1; int count_nram2gdram = -1; diff --git a/kernels/unary_op/complex_unary_op_3pipeline.h b/kernels/unary_op/complex_unary_op_3pipeline.h index a32b43ddc..e81f83f1e 100644 --- a/kernels/unary_op/complex_unary_op_3pipeline.h +++ 
b/kernels/unary_op/complex_unary_op_3pipeline.h @@ -34,12 +34,12 @@ #define COMPLEX_UNARY_OP_KERNEL_3PIPELINE_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineComplex##Op##Prefer( \ - char *x, char *y, size_t element_num, Args... args); + int8_t *x, int8_t *y, size_t element_num, Args... args); #define COMPLEX_UNARY_OP_KERNEL_3PIPELINE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineComplex##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... args) { \ if (__is_mpu()) { \ return; \ @@ -53,9 +53,9 @@ span_num_deal, align_num, args...); \ size_t num_per_core = element_num / taskDim; \ size_t num_rem = element_num % taskDim; \ - char *input_start = \ + int8_t *input_start = \ input_gdram + taskId * num_per_core * sizeof(DType_in); \ - char *output_start = \ + int8_t *output_start = \ output_gdram + taskId * num_per_core * sizeof(DType_out); \ if (num_rem > 0 && taskId == taskDim - 1) { \ num_per_core = num_per_core + num_rem; \ @@ -63,10 +63,10 @@ int repeat = num_per_core / span_num_deal; \ size_t rem = num_per_core % span_num_deal; \ size_t align_rem = CEIL_ALIGN(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ size_t span_load_size = span_num_deal * sizeof(DType_in); \ size_t span_store_size = span_num_deal * sizeof(DType_out); \ if (repeat > 0) { \ @@ -82,11 +82,9 @@ __asm__ volatile("sync;"); \ } \ for (int i = 0; i < repeat - 2; i++) { \ - pvLock(); \ __memcpy_async(output_start + i * span_store_size, \ ping_output + (i % 2) * 
ping_pong_gap, span_store_size, \ NRAM2GDRAM); \ - pvUnlock(); \ __memcpy_async(ping_input + (i % 2) * ping_pong_gap, \ input_start + (i + 2) * span_load_size, span_load_size, \ GDRAM2NRAM); \ diff --git a/kernels/unary_op/complex_unary_op_stride_3pipeline.h b/kernels/unary_op/complex_unary_op_stride_3pipeline.h index 0c9070bf4..7df334c58 100644 --- a/kernels/unary_op/complex_unary_op_stride_3pipeline.h +++ b/kernels/unary_op/complex_unary_op_stride_3pipeline.h @@ -36,15 +36,16 @@ template \ __mlu_global__ void \ MLUBlockKernel3StagePipelineWithStrideComplex##Op##Prefer( \ - char *x, mluop::TensorShape x_shape, char *y, \ + int8_t *x, mluop::TensorShape x_shape, int8_t *y, \ mluop::TensorShape y_shape, size_t element_num, Args... args); #define COMPLEX_UNARY_OP_KERNEL_3PIPELINE_WITH_STRIDE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void \ MLUBlockKernel3StagePipelineWithStrideComplex##Op##Prefer( \ - char *input_gdram, mluop::TensorShape x_shape, char *output_gdram, \ - mluop::TensorShape y_shape, size_t element_num, Args... args) { \ + int8_t *input_gdram, mluop::TensorShape x_shape, \ + int8_t *output_gdram, mluop::TensorShape y_shape, \ + size_t element_num, Args... 
args) { \ if (__is_mpu()) { \ return; \ } \ @@ -73,11 +74,11 @@ size_t rem = num_per_core % num_deal; \ size_t rem_align = CEIL_ALIGN(rem, align_num); \ \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ /* Two auxiliary pointers.*/ \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ \ if (repeat > 0) { \ tensorStrideLoad(ping_input, input_gdram, task_offset, \ diff --git a/kernels/unary_op/unary_op_3pipeline.h b/kernels/unary_op/unary_op_3pipeline.h index 28c88e960..5d247098c 100644 --- a/kernels/unary_op/unary_op_3pipeline.h +++ b/kernels/unary_op/unary_op_3pipeline.h @@ -37,12 +37,12 @@ #define UNARY_OP_KERNEL_3PIPELINE_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipeline##Op##Prefer( \ - char *x, char *y, size_t element_num, Args... args); + int8_t *x, int8_t *y, size_t element_num, Args... args); #define UNARY_OP_KERNEL_3PIPELINE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipeline##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... 
args) { \ if (__is_mpu()) { \ return; \ @@ -56,9 +56,9 @@ span_num_deal, align_num, args...); \ size_t num_per_core = element_num / taskDim; \ size_t num_rem = element_num % taskDim; \ - char *input_start = \ + int8_t *input_start = \ input_gdram + taskId * num_per_core * sizeof(DType_in); \ - char *output_start = \ + int8_t *output_start = \ output_gdram + taskId * num_per_core * sizeof(DType_out); \ if (num_rem > 0 && taskId == taskDim - 1) { \ num_per_core = num_per_core + num_rem; \ @@ -66,10 +66,10 @@ int repeat = num_per_core / span_num_deal; \ size_t rem = num_per_core % span_num_deal; \ size_t align_rem = CEIL_ALIGN(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ size_t span_load_size = span_num_deal * sizeof(DType_in); \ size_t span_store_size = span_num_deal * sizeof(DType_out); \ if (repeat > 0) { \ @@ -85,11 +85,9 @@ __asm__ volatile("sync;"); \ } \ for (int i = 0; i < repeat - 2; i++) { \ - pvLock(); \ __memcpy_async(output_start + i * span_store_size, \ ping_output + (i % 2) * ping_pong_gap, span_store_size, \ NRAM2GDRAM); \ - pvUnlock(); \ __memcpy_async(ping_input + (i % 2) * ping_pong_gap, \ input_start + (i + 2) * span_load_size, span_load_size, \ GDRAM2NRAM); \ @@ -137,7 +135,7 @@ #define UNARY_OP_KERNEL_3PIPELINE_V2_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineV2##Op##Prefer( \ - char *x, char *y, size_t normal_core_elem_num, \ + int8_t *x, int8_t *y, size_t normal_core_elem_num, \ size_t tail_core_elem_num, uint32_t output_input_gap, \ uint32_t ping_pong_gap, uint32_t auxiliary_a_gap, \ uint32_t auxiliary_b_gap, uint32_t 
span_num_deal, uint32_t align_num, \ @@ -147,14 +145,14 @@ #define UNARY_OP_KERNEL_3PIPELINE_V2_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineV2##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t normal_core_elem_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t normal_core_elem_num, \ size_t tail_core_elem_num, uint32_t output_input_gap, \ uint32_t ping_pong_gap, uint32_t auxiliary_a_gap, \ uint32_t auxiliary_b_gap, uint32_t span_num_deal, uint32_t align_num, \ Args... args) { \ - const char *const input_start = \ + const int8_t *const input_start = \ input_gdram + taskId * normal_core_elem_num * sizeof(DType_in); \ - char *const output_start = \ + int8_t *const output_start = \ output_gdram + taskId * normal_core_elem_num * sizeof(DType_out); \ const size_t num_cur_core = \ (taskId + 1 == taskDim) ? tail_core_elem_num : normal_core_elem_num; \ @@ -162,10 +160,10 @@ const uint32_t repeat = num_cur_core / span_num_deal; \ const uint32_t rem = num_cur_core % span_num_deal; \ const uint32_t align_rem = CEIL_ALIGN(rem, align_num); \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ size_t span_load_size = span_num_deal * U32_SIZE_OF(DType_in); \ size_t span_store_size = span_num_deal * U32_SIZE_OF(DType_out); \ \ diff --git a/kernels/unary_op/unary_op_4pipeline.h b/kernels/unary_op/unary_op_4pipeline.h index b29297676..d8c417d77 100644 --- a/kernels/unary_op/unary_op_4pipeline.h +++ b/kernels/unary_op/unary_op_4pipeline.h @@ -31,7 +31,7 @@ #define UNARY_OP_4PIPELINE_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel4StagePipeline##Op##Prefer( 
\ - char *x, char *y, size_t element_num, Args... args); + int8_t *x, int8_t *y, size_t element_num, Args... args); #define PRINTF(fmt, ...) \ if (false) { \ @@ -65,7 +65,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, #define UNARY_OP_4PIPELINE_IMPLEMENTATION(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel4StagePipeline##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... args) { \ size_t output_input_gap = 0, ping_pong_gap = 0; \ size_t auxiliary_a_gap = 0, auxiliary_b_gap = 0; \ @@ -76,8 +76,8 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, span_num_deal, align_num, args...); \ size_t cluster_num_deal = 0, cluster_offset = 0; \ strategyOfPartitionCluster(element_num, cluster_num_deal, cluster_offset); \ - char *load_start = input_gdram + cluster_offset * sizeof(DType_in); \ - char *store_start = output_gdram + cluster_offset * sizeof(DType_out); \ + int8_t *load_start = input_gdram + cluster_offset * sizeof(DType_in); \ + int8_t *store_start = output_gdram + cluster_offset * sizeof(DType_out); \ size_t cluster_span_deal = span_num_deal * coreDim; \ int32_t repeat = cluster_num_deal / cluster_span_deal; \ size_t cluster_remain = cluster_num_deal % cluster_span_deal; \ @@ -92,11 +92,11 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, strategyOfPartitionCore(cluster_remain, core_remain_num_deal, \ core_remain_offset); \ size_t align_remain_num = PAD_UP(core_remain_num_deal, align_num); \ - char *sram_ping = sram_buffer; \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *sram_ping = sram_buffer; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ 
+ int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ if (repeat > 0) { \ __memcpy_async(sram_ping, load_start, cluster_load_size, GDRAM2SRAM); \ __sync_cluster(); \ @@ -107,7 +107,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, GDRAM2SRAM); \ __memcpy_async(ping_input, sram_ping + sram_load_offset, core_load_size, \ SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer( \ ping_output, ping_input, auxiliary_a, auxiliary_b, span_num_deal, \ span_num_deal, args...); \ @@ -115,18 +115,16 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, } \ for (int i = 0; i < repeat - 2; i++) { \ int ping_flag = i % 2, pong_flag = (i + 1) % 2; \ - pvLock(); \ __memcpy_async(store_start + i * cluster_store_size + sram_store_offset, \ ping_output + ping_flag * ping_pong_gap, core_store_size, \ NRAM2GDRAM); \ - pvUnlock(); \ __memcpy_async(sram_ping + ping_flag * sram_pong_gap, \ load_start + (i + 2) * cluster_load_size, \ cluster_load_size, GDRAM2SRAM); \ __memcpy_async(ping_input + pong_flag * ping_pong_gap, \ sram_ping + pong_flag * sram_pong_gap + sram_load_offset, \ core_load_size, SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer( \ ping_output + pong_flag * ping_pong_gap, \ ping_input + pong_flag * ping_pong_gap, auxiliary_a, auxiliary_b, \ @@ -134,12 +132,10 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, __sync_cluster(); \ } \ if (repeat > 1) { \ - pvLock(); \ __memcpy_async( \ store_start + (repeat - 2) * cluster_store_size + sram_store_offset, \ ping_output + ((repeat - 2) % 2) * ping_pong_gap, core_store_size, \ NRAM2GDRAM); \ - pvUnlock(); \ } \ if (cluster_remain > 0) { \ __memcpy_async(sram_ping + (repeat % 2) * sram_pong_gap, \ @@ -152,7 +148,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, ping_input + ping_pong_flag * ping_pong_gap, \ sram_ping + ping_pong_flag * sram_pong_gap + sram_load_offset, \ core_load_size, SRAM2NRAM); \ - __sync_move(); \ + 
__sync_move(); \ compute##Op##Prefer( \ ping_output + ping_pong_flag * ping_pong_gap, \ ping_input + ping_pong_flag * ping_pong_gap, auxiliary_a, \ @@ -160,12 +156,10 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, } \ __sync_cluster(); \ if (repeat > 0) { \ - pvLock(); \ __memcpy_async( \ store_start + (repeat - 1) * cluster_store_size + sram_store_offset, \ ping_output + ((repeat - 1) % 2) * ping_pong_gap, core_store_size, \ NRAM2GDRAM); \ - pvUnlock(); \ } \ if (core_remain_num_deal > 0) { \ int ping_pong_flag = repeat % 2; \ @@ -173,25 +167,23 @@ __mlu_func__ void strategyOfPartitionCore(size_t remain_num, sram_ping + ping_pong_flag * sram_pong_gap + \ core_remain_offset * sizeof(DType_in), \ core_remain_num_deal * sizeof(DType_in), SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer( \ ping_output + ping_pong_flag * ping_pong_gap, \ ping_input + ping_pong_flag * ping_pong_gap, auxiliary_a, \ auxiliary_b, align_remain_num, core_remain_num_deal, args...); \ __asm__ volatile("sync;"); \ - pvLock(); \ __memcpy_async(store_start + repeat * cluster_store_size + \ core_remain_offset * sizeof(DType_out), \ ping_output + ping_pong_flag * ping_pong_gap, \ core_remain_num_deal * sizeof(DType_out), NRAM2GDRAM); \ - pvUnlock(); \ } \ } #else #define UNARY_OP_4PIPELINE_IMPLEMENTATION(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel4StagePipeline##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... 
args) {} #endif #endif // KERNELS_UNARY_OP_UNARY_OP_4PIPELINE_H_ diff --git a/kernels/unary_op/unary_op_5pipeline.h b/kernels/unary_op/unary_op_5pipeline.h index 0fb03d467..ccd6aa55f 100644 --- a/kernels/unary_op/unary_op_5pipeline.h +++ b/kernels/unary_op/unary_op_5pipeline.h @@ -35,7 +35,7 @@ #define UNARY_OP_KERNEL_5PIPELINE_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel5StagePipeline##Op##Prefer( \ - char *x, char *y, size_t element_num, Args... args); + int8_t *x, int8_t *y, size_t element_num, Args... args); #define UNARY_OP_KERNEL_5PIPELINE_MVCVT_DECLARE(Op, ...) \ template \ @@ -65,7 +65,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, #define UNARY_OP_KERNEL_5PIPELINE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel5StagePipeline##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... args) { \ /* The gap of input and output.*/ \ size_t output_input_gap = 0; \ @@ -104,16 +104,16 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, size_t cluster_rem_per_core_align = \ PAD_UP(cluster_rem_per_core, align_num); \ \ - char *load_start = \ - (char *)input_gdram + offset_cluster * sizeof(DType_in); \ - char *store_start = \ - (char *)output_gdram + offset_cluster * sizeof(DType_out); \ + int8_t *load_start = \ + (int8_t *)input_gdram + offset_cluster * sizeof(DType_in); \ + int8_t *store_start = \ + (int8_t *)output_gdram + offset_cluster * sizeof(DType_out); \ \ - char *sram_ping = (char *)sram_buffer; \ - char *nram_output = (char *)nram_buffer; \ - char *nram_input = (char *)nram_buffer + output_input_gap; \ - char *nram_aux_a = (char *)nram_buffer + auxiliary_a_gap; \ - char *nram_aux_b = (char *)nram_buffer + auxiliary_b_gap; \ + int8_t *sram_ping = (int8_t *)sram_buffer; \ + int8_t *nram_output = (int8_t *)nram_buffer; \ + int8_t *nram_input = (int8_t 
*)nram_buffer + output_input_gap; \ + int8_t *nram_aux_a = (int8_t *)nram_buffer + auxiliary_a_gap; \ + int8_t *nram_aux_b = (int8_t *)nram_buffer + auxiliary_b_gap; \ \ if (repeat > 0) { \ __memcpy_async(sram_ping, load_start, cluster_load_size, GDRAM2SRAM); \ @@ -126,7 +126,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, GDRAM2SRAM); \ __memcpy_async(nram_input, sram_ping + sram_load_offset, core_load_size, \ SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer(nram_output, nram_input, \ nram_aux_a, nram_aux_b, \ num_deal, num_deal, args...); \ @@ -147,7 +147,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, nram_input, \ sram_ping + ((i + 1) % 2) * sram_pong_gap + sram_load_offset, \ core_load_size, SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer(nram_output, nram_input, \ nram_aux_a, nram_aux_b, \ num_deal, num_deal, args...); \ @@ -175,7 +175,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, nram_input, \ sram_ping + ((repeat - 1) % 2) * sram_pong_gap + sram_load_offset, \ core_load_size, SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer(nram_output, nram_input, \ nram_aux_a, nram_aux_b, \ num_deal, num_deal, args...); \ @@ -198,7 +198,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, sram_ping + (repeat % 2) * sram_pong_gap + \ offset_core * sizeof(DType_in), \ cluster_rem_per_core * sizeof(DType_in), SRAM2NRAM); \ - __sync_move(); \ + __sync_move(); \ compute##Op##Prefer( \ nram_output, nram_input, nram_aux_a, nram_aux_b, \ cluster_rem_per_core_align, cluster_rem_per_core, args...); \ @@ -218,7 +218,7 @@ __mlu_func__ void strategyOfPartitionCore(size_t data_num, size_t &num_per_core, #define UNARY_OP_KERNEL_5PIPELINE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel5StagePipeline##Op##Prefer( \ - char *input_gdram, char *output_gdram, 
size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... args) {} #endif diff --git a/kernels/unary_op/unary_op_host.cpp b/kernels/unary_op/unary_op_host.cpp index 83c8f5667..2d7ba9c4d 100644 --- a/kernels/unary_op/unary_op_host.cpp +++ b/kernels/unary_op/unary_op_host.cpp @@ -47,7 +47,7 @@ void unaryOpPolicyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, tensor_size = CEIL_ALIGN(tensor_size, NFU_ALIGN_SIZE); uint64_t need_core = CEIL_ALIGN(tensor_size / NFU_ALIGN_SIZE, core_in_cluster); - *k_type = CNRT_FUNC_TYPE_UNION1; // default func type + *k_type = cnrtFuncTypeUnion1; // default func type k_dim->x = core_in_cluster; if (need_core < core_number) { k_dim->y = need_core / core_in_cluster; @@ -68,7 +68,7 @@ void unaryOpPolicyFuncBlock(mluOpHandle_t handle, cnrtDim3_t *k_dim, uint32_t core_used = CEIL_ALIGN(data_size, OPTIMAL_BOUNDARY) / OPTIMAL_BOUNDARY; core_used = core_used > core_num ? core_num : core_used; - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; k_dim->x = 1; k_dim->y = core_used; k_dim->z = 1; @@ -81,7 +81,7 @@ void unaryOpPolicyFuncBlock_v2(mluOpHandle_t handle, cnrtDim3_t &k_dim, cnrtFunctionType_t &k_type, size_t &normal_core_elem_num, size_t &tail_core_elem_num) { - k_type = CNRT_FUNC_TYPE_BLOCK; + k_type = cnrtFuncTypeBlock; if (MLUOP_MLU590 == handle->arch) { const size_t llc_pending_size = 512; single_core_min_load_size = diff --git a/kernels/unary_op/unary_op_no_pipeline.h b/kernels/unary_op/unary_op_no_pipeline.h index 4a01684e6..8d81f6a7c 100644 --- a/kernels/unary_op/unary_op_no_pipeline.h +++ b/kernels/unary_op/unary_op_no_pipeline.h @@ -30,12 +30,12 @@ #define UNARY_OP_NO_PIPELINE_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernelNoPipeline##Op##Prefer( \ - char *x, char *y, size_t element_num, Args... args); + int8_t *x, int8_t *y, size_t element_num, Args... 
args); #define UNARY_OP_NO_PIPELINE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernelNoPipeline##Op##Prefer( \ - char *input_gdram, char *output_gdram, size_t element_num, \ + int8_t *input_gdram, int8_t *output_gdram, size_t element_num, \ Args... args) { \ if (__is_mpu()) { \ return; \ @@ -48,9 +48,9 @@ align_num, args...); \ size_t num_per_core = element_num / taskDim; \ size_t num_rem = element_num % taskDim; \ - char *input_start = \ + int8_t *input_start = \ input_gdram + taskId * num_per_core * sizeof(DType_in); \ - char *output_start = \ + int8_t *output_start = \ output_gdram + taskId * num_per_core * sizeof(DType_out); \ if (num_rem > 0 && taskId == taskDim - 1) { \ num_per_core = num_per_core + num_rem; \ @@ -58,10 +58,10 @@ int repeat = num_per_core / span_num_deal; \ size_t rem = num_per_core % span_num_deal; \ size_t align_rem = CEIL_ALIGN(rem, align_num); \ - char *output = nram_buffer; \ - char *input = nram_buffer + output_input_gap; \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *output = nram_buffer; \ + int8_t *input = nram_buffer + output_input_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ size_t span_load_size = span_num_deal * sizeof(DType_in); \ size_t span_store_size = span_num_deal * sizeof(DType_out); \ for (int i = 0; i < repeat; ++i) { \ @@ -70,20 +70,16 @@ compute##Op##Prefer(output, input, auxiliary_a, \ auxiliary_b, span_num_deal, \ span_num_deal, args...); \ - pvLock(); \ __memcpy(output_start + i * span_store_size, output, span_store_size, \ NRAM2GDRAM); \ - pvUnlock(); \ } \ if (rem > 0) { \ __memcpy(input, input_start + repeat * span_load_size, \ rem * sizeof(DType_in), GDRAM2NRAM); \ compute##Op##Prefer( \ output, input, auxiliary_a, auxiliary_b, align_rem, rem, args...); \ - pvLock(); \ __memcpy(output_start + repeat * span_store_size, output, \ rem * 
sizeof(DType_out), NRAM2GDRAM); \ - pvUnlock(); \ } \ } diff --git a/kernels/unary_op/unary_op_stride_3pipeline.h b/kernels/unary_op/unary_op_stride_3pipeline.h index 463a7ebff..ff2e5c020 100644 --- a/kernels/unary_op/unary_op_stride_3pipeline.h +++ b/kernels/unary_op/unary_op_stride_3pipeline.h @@ -37,13 +37,13 @@ #define UNARY_OP_KERNEL_3PIPELINE_WITH_STRIDE_DECLARE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineWithStride##Op##Prefer( \ - char *x, mluop::TensorShape x_shape, char *y, \ + int8_t *x, mluop::TensorShape x_shape, int8_t *y, \ mluop::TensorShape y_shape, size_t element_num, Args... args); #define UNARY_OP_KERNEL_3PIPELINE_WITH_STRIDE_IMPLE(Op, Prefer) \ template \ __mlu_global__ void MLUBlockKernel3StagePipelineWithStride##Op##Prefer( \ - char *input_gdram, mluop::TensorShape x_shape, char *output_gdram, \ + int8_t *input_gdram, mluop::TensorShape x_shape, int8_t *output_gdram, \ mluop::TensorShape y_shape, size_t element_num, Args... args) { \ if (__is_mpu()) { \ return; \ @@ -73,11 +73,11 @@ size_t rem = num_per_core % num_deal; \ size_t rem_align = CEIL_ALIGN(rem, align_num); \ \ - char *ping_output = nram_buffer; \ - char *ping_input = nram_buffer + output_input_gap; \ + int8_t *ping_output = nram_buffer; \ + int8_t *ping_input = nram_buffer + output_input_gap; \ /* Two auxiliary pointers.*/ \ - char *auxiliary_a = nram_buffer + auxiliary_a_gap; \ - char *auxiliary_b = nram_buffer + auxiliary_b_gap; \ + int8_t *auxiliary_a = nram_buffer + auxiliary_a_gap; \ + int8_t *auxiliary_b = nram_buffer + auxiliary_b_gap; \ \ if (repeat > 0) { \ tensorStrideLoad(ping_input, input_gdram, task_offset, \ diff --git a/kernels/utils/common.h b/kernels/utils/common.h index 4d70d691c..f4ca0f2af 100644 --- a/kernels/utils/common.h +++ b/kernels/utils/common.h @@ -25,6 +25,7 @@ #ifndef KERNELS_UTILS_COMMON_H_ #define KERNELS_UTILS_COMMON_H_ +#include #include #include "float.h" @@ -68,23 +69,13 @@ __mlu_func__ T __mluop_max(T a, T b) {  * 
param 'src' is the source pointer in NRAM.  * param 'src_count' is the src element count.  * Note: - * The rounding mode on MLU200 is rd, on MLU300 is rn. + * The rounding mode on MLU300 is rn.  ******************************************************************************/ __mlu_func__ void __mluop_float2half(half *dst, float *src, int src_count) { -#if __BANG_ARCH__ >= 300 __bang_float2half_rn(dst, src, src_count); -#else - __bang_float2half_rd(dst, src, src_count); -#endif } -__mlu_func__ half __mluop_float2half(float a) { -#if __BANG_ARCH__ >= 300 - return __float2half_rn(a); -#else - return __float2half_rd(a); -#endif -} +__mlu_func__ half __mluop_float2half(float a) { return __float2half_rn(a); } /******************************************************************************  * MLUOP FUNC: __mluop_div @@ -165,26 +156,11 @@ __mlu_func__ void __mluop_recip(T *nram_dst, T *nram_src, void *nram_addition, const bool is_high_precision, const uint32_t deal_num) { if (sizeof(T) == sizeof(float)) { -#if __BANG_ARCH__ >= 300 __bang_recip((float *)nram_dst, (float *)nram_src, deal_num); -#else - __bang_active_reciphp((float *)nram_dst, (float *)nram_src, deal_num); -#endif } else if (sizeof(T) == sizeof(half)) { -#if __BANG_ARCH__ >= 300 __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); __bang_recip((float *)nram_addition, (float *)nram_addition, deal_num); __bang_float2half_rn((half *)nram_dst, (float *)nram_addition, deal_num); -#else - if (is_high_precision) { - __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); - __bang_active_reciphp((float *)nram_addition, (float *)nram_addition, - deal_num); - __bang_float2half_rd((half *)nram_dst, (float *)nram_addition, deal_num); - } else { - __bang_active_reciphp((half *)nram_dst, (half *)nram_src, deal_num); - } -#endif } else { return; } @@ -204,17 +180,12 @@ template __mlu_func__ void __mluop_exp(T *nram_dst, T *nram_src, void *nram_addition, const int is_high_precision, const int 
deal_num) { if (sizeof(T) == sizeof(float)) { -#if __BANG_ARCH__ >= 300 int x2d = 0x3fb8aa3b; float log2e = *(float *)&x2d; __bang_mul_scalar((float *)nram_dst, (float *)nram_src, (float)log2e, deal_num); __bang_pow2((float *)nram_dst, (float *)nram_dst, deal_num); -#else - __bang_active_exphp((float *)nram_dst, (float *)nram_src, deal_num); -#endif } else if (sizeof(T) == sizeof(half)) { -#if __BANG_ARCH__ >= 300 int x2d = 0x3fb8aa3b; float log2e = *(float *)&x2d; __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); @@ -222,16 +193,6 @@ __mlu_func__ void __mluop_exp(T *nram_dst, T *nram_src, void *nram_addition, (float)log2e, deal_num); __bang_pow2((float *)nram_addition, (float *)nram_addition, deal_num); __bang_float2half_rn((half *)nram_dst, (float *)nram_addition, deal_num); -#else - if (is_high_precision) { - __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); - __bang_active_exphp((float *)nram_addition, (float *)nram_addition, - deal_num); - __bang_float2half_rd((half *)nram_dst, (float *)nram_addition, deal_num); - } else { - __bang_active_exphp((half *)nram_dst, (half *)nram_src, deal_num); - } -#endif } else { return; } @@ -257,14 +218,14 @@ __mlu_func__ void __mluop_log(T *nram_dst, T *nram_src, void *nram_addition, if (sizeof(T) == sizeof(float)) { int x2d = 0x3f317217; float rlog2e = *(float *)&x2d; - __bang_log((float *)nram_dst, (float *)nram_src, deal_num); + __bang_log2((float *)nram_dst, (float *)nram_src, deal_num); __bang_mul_scalar((float *)nram_dst, (float *)nram_dst, (float)rlog2e, deal_num); } else if (sizeof(T) == sizeof(half)) { int x2d = 0x3f317217; float rlog2e = *(float *)&x2d; __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); - __bang_log((float *)nram_addition, (float *)nram_addition, deal_num); + __bang_log2((float *)nram_addition, (float *)nram_addition, deal_num); __mluop_float2half((half *)nram_dst, (float *)nram_addition, deal_num); __bang_mul_scalar((half *)nram_dst, 
(half *)nram_dst, (half)rlog2e, deal_num); @@ -289,18 +250,13 @@ __mlu_func__ void __mluop_sigmoid(T *nram_dst, T *nram_src, void *nram_addition, const int is_high_precision, const int deal_num) { if (sizeof(T) == sizeof(float)) { -#if __BANG_ARCH__ >= 300 __bang_mul_scalar((float *)nram_dst, (float *)nram_src, (float)-1.0, deal_num); __mluop_exp((float *)nram_dst, (float *)nram_dst, NULL, 0, deal_num); __bang_add_scalar((float *)nram_dst, (float *)nram_dst, (float)1.0, deal_num); __mluop_recip((float *)nram_dst, (float *)nram_dst, NULL, 0, deal_num); -#else - __bang_active_sigmoid((float *)nram_dst, (float *)nram_src, deal_num); -#endif } else if (sizeof(T) == sizeof(half)) { -#if __BANG_ARCH__ >= 300 __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); __bang_mul_scalar((float *)nram_addition, (float *)nram_addition, (float)-1.0, deal_num); @@ -310,16 +266,6 @@ __mlu_func__ void __mluop_sigmoid(T *nram_dst, T *nram_src, void *nram_addition, (float)1.0, deal_num); __mluop_recip((float *)nram_dst, (float *)nram_addition, NULL, 0, deal_num); __bang_float2half_rn((half *)nram_dst, (float *)nram_dst, deal_num); -#else - if (is_high_precision) { - __bang_half2float((float *)nram_addition, (half *)nram_src, deal_num); - __bang_active_sigmoid((float *)nram_addition, (float *)nram_addition, - deal_num); - __bang_float2half_rd((half *)nram_dst, (float *)nram_addition, deal_num); - } else { - __bang_active_sigmoid((half *)nram_dst, (half *)nram_src, deal_num); - } -#endif } else { return; } @@ -353,176 +299,6 @@ __mlu_func__ void __mluop_recursive_sum_pool(T *dst, int low_dim, int high_dim, return; } -/***************************************************************************** - * MLUOPS FUNC: __mluop_int322float - * param 'dst' is the destination pointer in NRAM, same memory space as src - * required in NRAM - * param 'dst_addition' is the addition workspace of dst, requiring the same - * amount of space as dst in NRAM - * param 'src' is the source 
pointer in NRAM - * param 'src_addition' is the addition workspace of src, requiring only 128B - * space in NRAM - * param 'src_count' is the src element count - * Notes: - * the sapces pointed by dst and src can not overlap - * src_count*sizeof(float) should be divisible by 128 - * src input must be in range of [-2^23, 2^23-1] for MLU270 and MLU290 - *****************************************************************************/ -__mlu_func__ void __mluop_int322float(float *dst, float *dst_addition, - int32_t *src, float *src_addition, - int32_t src_count) { -#if __BANG_ARCH__ >= 300 - __bang_int322float((float *)dst, (int32_t *)src, src_count, 0); -#else - // get sign bit - int32_t seg_elem_count = 32; // 128/sizeof(float) = 32 - int32_t float_size = 4; // sizeof(float) = 4 - int32_t align_128 = 128; - float move_23bit = 8388608.0; - // 0x80000000 = 1,000000000,0000000000000000000000000000 - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x80000000); - __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, - src_count * float_size, align_128); - // get 1 or 0 from sign bit - // judge is Odd - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x00000001); - __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, - (char *)src_addition, src_count * float_size, align_128); - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x80000001); - __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, - seg_elem_count); - // minus xor, positive num invariant - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0xffffffff); - __bang_cycle_mul(dst, dst_addition, src_addition, src_count, seg_elem_count); - __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * float_size); - // convert int32 to float32 - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x7fffff); - __bang_cycle_band((char *)dst, (char *)dst, (char 
*)src_addition, - src_count * float_size, align_128); - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x4b000000); - __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, - src_count * float_size, align_128); - __bang_sub_scalar(dst, dst, move_23bit, src_count); - // add one - __bang_add(dst, dst, dst_addition, src_count); - // set sign for float32 - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0xffffffff); - __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, - seg_elem_count); - - // fix on MLU300 - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x00000001); - __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, - seg_elem_count); - // end fix - - __bang_write_value((unsigned *)src_addition, seg_elem_count, - (unsigned)0x80000000); - __bang_cycle_band((char *)dst_addition, (char *)dst_addition, - (char *)src_addition, src_count * float_size, align_128); - __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, - src_count * float_size); -#endif -} - -/***************************************************************************** - * MLUOPS FUNC: __mluop_float2int32 - * param 'dst' is the destination pointer in NRAM, same memory space as src - * required in NRAM - * param 'dst_addition' is the addition workspace of dst, requiring the same - * amount of space as dst in NRAM - * param 'src' is the source pointer in NRAM - * param 'src_addition' is the addition workspace of src, requiring only 128B - * space in NRAM - * param 'src_count' is the src element count - * Notes: - * the sapces pointed by dst and src can not overlap - * src_count*sizeof(float) should be divisible by 128 - * src input must be in range of [-2^23, 2^23-1] for MLU270 and MLU290 - *****************************************************************************/ -__mlu_func__ void __mluop_float2int32(int32_t *dst, float *dst_addition, - float *src, float *src_addition, - 
int32_t src_count) { -#if __BANG_ARCH__ >= 322 - __bang_float2int32_tz((int32_t *)dst, (float *)src, src_count, 0); -#else - // sign ===> src_addition - // dst=-1.0 : when src[i] is a negative number - // dst=+1.0 : when src[i] is a positive number - int32_t floatDchar = sizeof(float) / sizeof(char); - __bang_active_sign((float *)dst, src, src_count); - // dst_addition = abs(src) - __bang_mul(dst_addition, src, (float *)dst, src_count); - // if dst_addition < 1.0, then src_addition + 1. to fix add error - __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 1.0f); - __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 0xbf800000); - // set negative flag -1.0 = 0xbf80000 - __bang_cycle_eq( - (float *)dst, (float *)dst, (float *)src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); // to mask all src in [x < -1.0] - __bang_active_abs(dst_addition, src, src_count); - __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - 8388608.0f); - // mask shift move 23 - __bang_cycle_add_tz( - dst_addition, dst_addition, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit - // dst=1.0, when src < -1.0 - // dst=0.0, when src >=-1.0 - __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); - // to fix max value - __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); - __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, - src_count * floatDchar); - // get log 23bit - __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), - (unsigned)0x007fffff); - // mask low 23bit is 1 - __bang_cycle_band((char *)dst_addition, (char *)dst_addition, - (char *)src_addition, src_count * floatDchar, - NFU_ALIGN_SIZE / sizeof(char)); - - 
__bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); - __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, - NFU_ALIGN_SIZE / sizeof(float)); - // src or dst_addition - __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, - src_count * floatDchar); - __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); - __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, - src_count * floatDchar); -#endif -} - -__mlu_func__ void pvLock() { -#if __BANG_ARCH__ == 270 - if (__is_ipu()) { - __bang_lock(0, 0); - } -#endif -} - -__mlu_func__ void pvUnlock() { -#if __BANG_ARCH__ == 270 - if (__is_ipu()) { - __bang_unlock(0, 0); - } -#endif -} - /****************************************************************************** * MLUOPS FUNC: __mluop_load_str_2D * param 'size' is the getC size. @@ -711,4 +487,213 @@ __mlu_vector__ void __mluop_get_indices(float *dst, float start_index, } } +template +__mlu_func__ void __mlu_op_arange_base_(T *dst_nram, uint32_t numel, + T start_index, T step) { + for (uint32_t i = 0; i < numel; i++) { + dst_nram[i] = start_index + i * step; + } +} + +#define MLUOP_ARANGE_VV_IMPL(VVType, vv_num, dst_nram, start_index, step) \ + do { \ + VVType vv_index[8]; \ + __vv_index(vv_index[0], start_index, step); \ + __vv_add(vv_index[1], vv_index[0], 1 * vv_num * step); \ + __vv_add(vv_index[2], vv_index[0], 2 * vv_num * step); \ + __vv_add(vv_index[3], vv_index[0], 3 * vv_num * step); \ + __vv_add(vv_index[4], vv_index[0], 4 * vv_num * step); \ + __vv_add(vv_index[5], vv_index[0], 5 * vv_num * step); \ + __vv_add(vv_index[6], vv_index[0], 6 * vv_num * step); \ + __vv_add(vv_index[7], vv_index[0], 7 * vv_num * step); \ + __vv_store(dst_nram, vv_index[0], vv_num); \ + __vv_store(dst_nram + vv_num, vv_index[1], vv_num); \ + __vv_store(dst_nram + 2 * vv_num, vv_index[2], vv_num); \ + __vv_store(dst_nram + 3 * vv_num, vv_index[3], vv_num); \ + __vv_store(dst_nram + 4 * vv_num, vv_index[4], vv_num); 
\ + __vv_store(dst_nram + 5 * vv_num, vv_index[5], vv_num); \ + __vv_store(dst_nram + 6 * vv_num, vv_index[6], vv_num); \ + __vv_store(dst_nram + 7 * vv_num, vv_index[7], vv_num); \ + } while (false) + +template +__mlu_vector__ void __mlu_op_arange_vv_(T *dst_nram, T start_index, T step) { +#if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_vv type error!"); +#else // #if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_vv type error!"); +#endif + + const uint32_t vv_num = __vv_get_length() / sizeof(T); + +#if _BANG_ARCH_ <= 592 + if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_uint32, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_int32, vv_num, dst_nram, start_index, step); + } +#endif // if _BANG_ARCH_ <= 592 + if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_float, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_half, vv_num, dst_nram, start_index, step); + } + return; +} + +#if 592 < _BANG_ARCH_ +template +__mlu_func__ void __mlu_op_gen_integer_incr_seq_(T *dst_nram, + uint32_t elem_count, + T start = 0, T step = 1) { + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same), + "__mlu_op_gen_integer_incr_seq type error!"); + if 
(std::is_same::value) { + __bang_incseq(reinterpret_cast(dst_nram), elem_count); + } else if (std::is_same::value) { + __bang_incseq(reinterpret_cast(dst_nram), elem_count); + } else { + __bang_incseq(dst_nram, elem_count); + } + + if (start != 0) { + if (std::is_same::value || std::is_same::value) { + if (step != 1) { + __bang_mul_scalar(dst_nram, dst_nram, step, elem_count); + } + __bang_add_scalar(dst_nram, dst_nram, start, elem_count); + } else { + __bang_fusion(FUSION_FMA, dst_nram, dst_nram, step, start, elem_count); + } + } +} +#endif // if 592 < _BANG_ARCH_ + +#define u32_sizeof(T) ((uint32_t)sizeof(T)) + +template +__mlu_func__ void __mlu_op_arange_by_expand_(T *dst_nram, uint32_t numel, + T start_index = 0, T step = 1) { +#if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_by_expand type error!"); +#else // if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_by_expand type error!"); +#endif // if 592 < _BANG_ARCH_ + + // using AluGenSize = std::integral_constant; + using GuGenSize = std::integral_constant; + uint32_t gu_gen_num = GuGenSize::value / u32_sizeof(T); + uint32_t alu_gen_num = NFU_ALIGN_SIZE / u32_sizeof(T); + uint32_t base_num = alu_gen_num; +#if _BANG_ARCH_ <= 592 + if (std::is_same::value || std::is_same::value) { + const uint32_t prologue_num = std::min(numel, base_num); + __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step); + + if (numel <= base_num) { + return; + } + } else { + if (numel <= gu_gen_num) { + const uint32_t prologue_num = std::min(numel, base_num); + __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step); + + if (numel <= base_num) { + return; + } + } else { + __mlu_op_arange_vv_(dst_nram, start_index, step); 
+ base_num = gu_gen_num; + } + } +#else + if (numel <= gu_gen_num) { + const uint32_t prologue_num = std::min(numel, base_num); + __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step); + + if (numel <= base_num) { + return; + } + } else { + __mlu_op_arange_vv_(dst_nram, start_index, step); + base_num = gu_gen_num; + } +#endif + // base_num = 2^exp + uint32_t exp = 0; + asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t" + : [ dst ] "+&r"(exp) + : [ src ] "r"(base_num)); + // numel = count * base_num + remain + const uint32_t segnum = numel >> exp; + // count = 2^repeat + uint32_t repeat = 0; + asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t" + : [ dst ] "+&r"(repeat) + : [ src ] "r"(segnum)); + uint32_t count = 1; + for (uint32_t i = 0; i < repeat; ++i) { + __bang_add_scalar(dst_nram + count * base_num, dst_nram, + count * base_num * step, count * base_num); + count *= 2; + } + + const uint32_t remain = numel - count * base_num; + if (0 < remain) { + __bang_add_scalar(dst_nram + count * base_num, dst_nram, + count * base_num * step, remain); + } +} +/*************************************************************************** + + CNNL FUNC: __mlu_op_gen_stage_index. + param "dst_nram" is a nram pointer to the generated result. + param "numel" is the element number of to be generated. + param "start_index" is the starting value for the set of points. Default: 0. + param "step" is the gap between each pair of adjacent points points. + Default: 1. dst_addition. remarks: Detailed introduction for reference + http://wiki.cambricon.com/pages/viewpage.action?pageId=119467501. + int64_t and uint64_t types are under-optimized and can be improved with GU. 
+ *************************************************************************/ + +template +__mlu_func__ void __mlu_op_gen_stage_index(T *dst_nram, uint32_t numel, + T start_index = 0, T step = 1) { +#if 592 < _BANG_ARCH_ + if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + __mlu_op_gen_integer_incr_seq_(dst_nram, numel, start_index, step); + } else { + __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step); + } +#else + __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step); +#endif +} + #endif // KERNELS_UTILS_COMMON_H_ diff --git a/kernels/voxel_pooling_forward/voxel_pooling_forward.cpp b/kernels/voxel_pooling_forward/voxel_pooling_forward.cpp index 1289d5a84..e9c33167f 100644 --- a/kernels/voxel_pooling_forward/voxel_pooling_forward.cpp +++ b/kernels/voxel_pooling_forward/voxel_pooling_forward.cpp @@ -36,7 +36,7 @@ static void policyFunc(const mluOpHandle_t handle, const int num_points_total, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { uint32_t cluster_num = mluop::runtime::getClusterLimitCapability(handle); uint32_t core_in_cluster = handle->core_num_per_cluster; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; k_dim->x = core_in_cluster; uint32_t use_cluster = (num_points_total + core_in_cluster - 1) / core_in_cluster; diff --git a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu index cc8242e83..90ecc8363 100644 --- a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu +++ b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu @@ -26,7 +26,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; #define BITINDEX_BYTED_ALIGNED 32 #define MAXNUM_PERF_SUPPORT (2147483648 / 4) @@ -79,7 +79,6 @@ __mlu_func__ void MLUKernelVoxelPoolingDefaultKernel( const int num_voxel_x, const int 
num_voxel_y, const int num_voxel_z, const int split_num, const int *geom_xyz, const float *input_features, float *output_features, int *pos_memo) { -#if __BANG_ARCH__ >= 322 if (__is_mpu()) { return; } @@ -232,7 +231,6 @@ __mlu_func__ void MLUKernelVoxelPoolingDefaultKernel( } } } -#endif } __mlu_func__ void MLUKernelVoxelPoolingStageTwoPerfKernel( @@ -416,7 +414,6 @@ __mlu_global__ void MLUKernelVoxelPoolingForward( const int num_voxel_x, const int num_voxel_y, const int num_voxel_z, const int *geom_xyz, const float *input_features, float *output_features, int *pos_memo) { -#if __BANG_ARCH__ >= 370 if (__is_mpu()) { return; } @@ -435,7 +432,6 @@ __mlu_global__ void MLUKernelVoxelPoolingForward( num_voxel_z, split_num, geom_xyz, input_features, output_features, pos_memo); } -#endif } mluOpStatus_t MLUOP_WIN_API KernelVoxelPoolingForward( diff --git a/kernels/voxelization/voxelization.cpp b/kernels/voxelization/voxelization.cpp index 709788593..6812bb894 100644 --- a/kernels/voxelization/voxelization.cpp +++ b/kernels/voxelization/voxelization.cpp @@ -40,7 +40,7 @@ static void policyFuncDefault(const mluOpHandle_t handle, std::min((num_points + k_dim->x - 1) / k_dim->x, (size_t)mluop::runtime::getClusterLimitCapability(handle)); k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; } static void policyFuncCalcPointsPerVoxel(const mluOpHandle_t handle, @@ -50,7 +50,7 @@ static void policyFuncCalcPointsPerVoxel(const mluOpHandle_t handle, k_dim->x = 1; k_dim->y = 1; k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; } mluOpStatus_t voxelizationParamCheck( @@ -284,13 +284,13 @@ mluOpStatus_t MLUOP_WIN_API mluOpVoxelization( void *temp_coors = workspace; // point_to_pointidx : [num_points] void *point_to_pointidx = - (char *)temp_coors + num_points * 3 * sizeof(int32_t); + (int8_t *)temp_coors + num_points * 3 * sizeof(int32_t); // point_to_voxelidx : [num_points] void *point_to_voxelidx = - (char 
*)point_to_pointidx + num_points * sizeof(int32_t); + (int8_t *)point_to_pointidx + num_points * sizeof(int32_t); // coor_to_voxelidx : [num_points] void *coor_to_voxelidx = - (char *)point_to_voxelidx + num_points * sizeof(int32_t); + (int8_t *)point_to_voxelidx + num_points * sizeof(int32_t); cnrtDim3_t k_dim; cnrtFunctionType_t k_type; diff --git a/kernels/voxelization/voxelization_kernel.mlu b/kernels/voxelization/voxelization_kernel.mlu index 466a50836..0d76c106e 100644 --- a/kernels/voxelization/voxelization_kernel.mlu +++ b/kernels/voxelization/voxelization_kernel.mlu @@ -29,7 +29,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ bool containNanInf(const T pos1, const T pos2) { @@ -40,10 +40,9 @@ __mlu_func__ bool containNanInf(const T pos1, const T pos2) { return false; } -#if __BANG_ARCH__ >= 322 __mlu_func__ void computeDynamicVoxelize( - char *points_x, char *points_y, char *points_z, char *auxiliary_a, - char *auxiliary_b, char *auxiliary_c, const float coors_x_min, + int8_t *points_x, int8_t *points_y, int8_t *points_z, int8_t *auxiliary_a, + int8_t *auxiliary_b, int8_t *auxiliary_c, const float coors_x_min, const float coors_y_min, const float coors_z_min, const float voxel_x, const float voxel_y, const float voxel_z, const int32_t grid_x, const int32_t grid_y, const int32_t grid_z, const int32_t deal_num) { @@ -125,8 +124,8 @@ __mlu_func__ void computeDynamicVoxelize( deal_num); } -__mlu_func__ void computePoint2Voxel(char *coors_x, char *coors_y, - char *coors_z, const int32_t c_x, +__mlu_func__ void computePoint2Voxel(int8_t *coors_x, int8_t *coors_y, + int8_t *coors_z, const int32_t c_x, const int32_t c_y, const int32_t c_z, const int32_t max_points, int32_t *num, int32_t *first_point, @@ -153,14 +152,12 @@ __mlu_func__ void computePoint2Voxel(char *coors_x, char *coors_y, *num += (int32_t)__bang_count((float *)coors_x, 
deal_num); } } -#endif __mlu_global__ void mluDynamicVoxelize(const float *points, const float *voxel_size_gdram, const float *coors_range_gdram, int32_t *coors, const int32_t num_points, const int32_t num_features) { -#if __BANG_ARCH__ >= 322 if (__is_mpu()) { return; } @@ -206,12 +203,12 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, const int32_t rem = points_per_core % deal_num; const int32_t ping_pong_gap = 3 * deal_num * sizeof(float); - char *points_x = nram_buffer; - char *points_y = points_x + deal_num * sizeof(float); - char *points_z = points_y + deal_num * sizeof(float); - char *auxiliary_a = points_x + 2 * ping_pong_gap; - char *auxiliary_b = auxiliary_a + deal_num * sizeof(float); - char *auxiliary_c = auxiliary_b + deal_num * sizeof(float); + int8_t *points_x = nram_buffer; + int8_t *points_y = points_x + deal_num * sizeof(float); + int8_t *points_z = points_y + deal_num * sizeof(float); + int8_t *auxiliary_a = points_x + 2 * ping_pong_gap; + int8_t *auxiliary_b = auxiliary_a + deal_num * sizeof(float); + int8_t *auxiliary_c = auxiliary_b + deal_num * sizeof(float); int32_t *coors_z_start = coors + points_start; int32_t *coors_y_start = coors + num_points + points_start; @@ -250,7 +247,6 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, } for (int32_t i = 0; i < repeat - 2; ++i) { - pvLock(); __memcpy_async(coors_x_start + i * deal_num, points_x + (i % 2) * ping_pong_gap, deal_num * sizeof(int32_t), NRAM2GDRAM); @@ -260,7 +256,7 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, __memcpy_async(coors_z_start + i * deal_num, points_z + (i % 2) * ping_pong_gap, deal_num * sizeof(int32_t), NRAM2GDRAM); - pvUnlock(); + __memcpy_async(points_x + (i % 2) * ping_pong_gap, points + (points_start + (i + 2) * deal_num) * num_features, sizeof(float), GDRAM2NRAM, sizeof(float), @@ -285,7 +281,6 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, } if (repeat >= 2) { - pvLock(); __memcpy_async(coors_x_start + 
(repeat - 2) * deal_num, points_x + (repeat % 2) * ping_pong_gap, deal_num * sizeof(int32_t), NRAM2GDRAM); @@ -295,7 +290,6 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, __memcpy_async(coors_z_start + (repeat - 2) * deal_num, points_z + (repeat % 2) * ping_pong_gap, deal_num * sizeof(int32_t), NRAM2GDRAM); - pvUnlock(); } if (rem > 0) { __memcpy_async(points_x + (repeat % 2) * ping_pong_gap, @@ -324,7 +318,6 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, __sync(); if (repeat > 0) { - pvLock(); __memcpy_async(coors_x_start + (repeat - 1) * deal_num, points_x + ((repeat - 1) % 2) * ping_pong_gap, deal_num * sizeof(int32_t), NRAM2GDRAM); @@ -334,7 +327,6 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, __memcpy_async(coors_z_start + (repeat - 1) * deal_num, points_z + ((repeat - 1) % 2) * ping_pong_gap, deal_num * sizeof(int32_t), NRAM2GDRAM); - pvUnlock(); } if (rem > 0) { computeDynamicVoxelize(points_x + (repeat % 2) * ping_pong_gap, @@ -344,7 +336,6 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, coors_z_min, voxel_x, voxel_y, voxel_z, grid_x, grid_y, grid_z, rem); __sync(); - pvLock(); __memcpy_async(coors_x_start + repeat * deal_num, points_x + (repeat % 2) * ping_pong_gap, rem * sizeof(int32_t), NRAM2GDRAM); @@ -354,16 +345,13 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, __memcpy_async(coors_z_start + repeat * deal_num, points_z + (repeat % 2) * ping_pong_gap, rem * sizeof(int32_t), NRAM2GDRAM); - pvUnlock(); } -#endif } __mlu_global__ void mluPoint2Voxel(int32_t *coors, int32_t *point_to_pointidx, int32_t *point_to_voxelidx, const int32_t num_points, const int32_t max_points) { -#if __BANG_ARCH__ >= 322 if (__is_mpu()) { return; } @@ -373,9 +361,9 @@ __mlu_global__ void mluPoint2Voxel(int32_t *coors, int32_t *point_to_pointidx, FLOOR_ALIGN(MAX_NRAM_SIZE / split_num / sizeof(int32_t), NFU_ALIGN_SIZE); const int32_t ping_pong_gap = 3 * deal_num * sizeof(int32_t); - char *coors_x = 
nram_buffer; - char *coors_y = coors_x + deal_num * sizeof(int32_t); - char *coors_z = coors_y + deal_num * sizeof(int32_t); + int8_t *coors_x = nram_buffer; + int8_t *coors_y = coors_x + deal_num * sizeof(int32_t); + int8_t *coors_z = coors_y + deal_num * sizeof(int32_t); int32_t *coors_z_start = coors; int32_t *coors_y_start = coors + num_points; @@ -467,7 +455,6 @@ __mlu_global__ void mluPoint2Voxel(int32_t *coors, int32_t *point_to_pointidx, point_to_voxelidx[point_idx] = -1; } } -#endif } __mlu_global__ void mluCalcPointsPerVoxel( @@ -702,7 +689,6 @@ __mlu_global__ void mluAssignVoxelsCoors( int32_t *coor_to_voxelidx, float *voxels, int32_t *coors, const int32_t max_points, const int32_t num_points, const int32_t num_features) { -#if __BANG_ARCH__ >= 322 if (__is_mpu()) { return; } @@ -754,7 +740,6 @@ __mlu_global__ void mluAssignVoxelsCoors( } } __sync(); -#endif } mluOpStatus_t MLUOP_WIN_API KernelDynamicVoxelize( diff --git a/kernels/yolo_box/yolo_box.cpp b/kernels/yolo_box/yolo_box.cpp index eb082ebdd..7d1c6d986 100644 --- a/kernels/yolo_box/yolo_box.cpp +++ b/kernels/yolo_box/yolo_box.cpp @@ -37,7 +37,7 @@ static void policyFunc(const mluOpHandle_t handle, const int kw_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; uint32_t cluster_num = mluop::runtime::getClusterLimitCapability(handle); uint32_t core_num_per_cluster = mluop::runtime::getCoreNumOfEachUnionCapability(handle); diff --git a/kernels/yolo_box/yolo_box_block.mlu b/kernels/yolo_box/yolo_box_block.mlu index 87994c52d..b258351ab 100644 --- a/kernels/yolo_box/yolo_box_block.mlu +++ b/kernels/yolo_box/yolo_box_block.mlu @@ -26,7 +26,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" -__nram__ char nram_buffer[MAX_NRAM_SIZE]; +__nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void memCopy(T *dst, const T *src, const int size, @@ -147,7 +147,6 @@ __mlu_func__ void compute(T *nram_x, T *nram_y, T 
*nram_w, T *nram_h, T *nram_conf_p = nram_conf + offset; T *nram_iou_p = nram_iou + offset; -#if __BANG_ARCH__ >= 322 // compute mask __mluop_sigmoid(nram_conf_p, nram_conf_p, NULL, 0, deal_num); if (iou_aware == true) { @@ -157,11 +156,11 @@ __mlu_func__ void compute(T *nram_x, T *nram_y, T *nram_w, T *nram_h, } else if ((T)iou_aware_factor == (T)1.0) { __bang_write_value(nram_conf_p, deal_num, (T)1.0); } else { - __bang_log(nram_iou_p, nram_iou_p, deal_num); + __bang_log2(nram_iou_p, nram_iou_p, deal_num); __bang_mul_scalar(nram_iou_p, nram_iou_p, (T)iou_aware_factor, deal_num); __bang_pow2(nram_iou_p, nram_iou_p, deal_num); - __bang_log(nram_conf_p, nram_conf_p, deal_num); + __bang_log2(nram_conf_p, nram_conf_p, deal_num); __bang_mul_scalar(nram_conf_p, nram_conf_p, (T)1.0 - (T)iou_aware_factor, deal_num); __bang_pow2(nram_conf_p, nram_conf_p, deal_num); @@ -234,126 +233,14 @@ __mlu_func__ void compute(T *nram_x, T *nram_y, T *nram_w, T *nram_h, __bang_float2int32(nram_mask_int32, nram_iou_p, deal_num, 0); __bang_mul_scalar((int *)nram_mask_int32, (int *)nram_mask_int32, (int)0xffffffff, deal_num); - __bang_band((char *)nram_x_p, (char *)nram_x_p, (char *)nram_mask_int32, + __bang_band((int8_t *)nram_x_p, (int8_t *)nram_x_p, (int8_t *)nram_mask_int32, 4 * deal_num); - __bang_band((char *)nram_y_p, (char *)nram_y_p, (char *)nram_mask_int32, + __bang_band((int8_t *)nram_y_p, (int8_t *)nram_y_p, (int8_t *)nram_mask_int32, 4 * deal_num); - __bang_band((char *)nram_w_p, (char *)nram_w_p, (char *)nram_mask_int32, + __bang_band((int8_t *)nram_w_p, (int8_t *)nram_w_p, (int8_t *)nram_mask_int32, 4 * deal_num); - __bang_band((char *)nram_h_p, (char *)nram_h_p, (char *)nram_mask_int32, + __bang_band((int8_t *)nram_h_p, (int8_t *)nram_h_p, (int8_t *)nram_mask_int32, 4 * deal_num); -#else - const int x2d = 0x3fb8aa3b; - float log2e = *(float *)&x2d; - __bang_mul_scalar(nram_conf_p, nram_conf_p, (float)-1.0, deal_num); - __bang_mul_scalar(nram_conf_p, nram_conf_p, log2e, 
deal_num); - if (iou_aware == true) { - __bang_mul_scalar(nram_iou_p, nram_iou_p, (float)-1.0, deal_num); - __bang_mul_scalar(nram_iou_p, nram_iou_p, log2e, deal_num); - } - __bang_mul_scalar(nram_x_p, nram_x_p, (float)-1.0, deal_num); - __bang_mul_scalar(nram_x_p, nram_x_p, log2e, deal_num); - __bang_mul_scalar(nram_y_p, nram_y_p, (float)-1.0, deal_num); - __bang_mul_scalar(nram_y_p, nram_y_p, log2e, deal_num); - __bang_mul_scalar(nram_w_p, nram_w_p, log2e, deal_num); - __bang_mul_scalar(nram_h_p, nram_h_p, log2e, deal_num); - - for (int k = 0; k < deal_num; ++k) { - // sigmoid(conf) - nram_conf_p[k] = powf((T)2.0, nram_conf_p[k]); - nram_conf_p[k] = (T)1.0 / ((T)1.0 + nram_conf_p[k]); - - if (iou_aware == true) { - // sigmoid(iou) - nram_iou_p[k] = powf((T)2.0, nram_iou_p[k]); - nram_iou_p[k] = (T)1.0 / ((T)1.0 + nram_iou_p[k]); - - // pow(iou, iou_aware_factor) - nram_iou_p[k] = powf(nram_iou_p[k], (T)iou_aware_factor); - // pow(iou, 1-iou_aware_factor) - nram_conf_p[k] = powf(nram_conf_p[k], (T)1.0 - (T)iou_aware_factor); - nram_conf_p[k] = nram_conf_p[k] * nram_iou_p[k]; - } - - if (nram_conf_p[k] < (T)conf_thresh) { - nram_x_p[k] = (T)0.0; - nram_y_p[k] = (T)0.0; - nram_w_p[k] = (T)0.0; - nram_h_p[k] = (T)0.0; - } else { - // sigmoid(x) - nram_x_p[k] = powf((T)2.0, nram_x_p[k]); - nram_x_p[k] = (T)1.0 / ((T)1.0 + nram_x_p[k]); - - // sigmoid(y) - nram_y_p[k] = powf((T)2.0, nram_y_p[k]); - nram_y_p[k] = (T)1.0 / ((T)1.0 + nram_y_p[k]); - - // exp(w) and exp(h) - nram_w_p[k] = powf((T)2.0, nram_w_p[k]); - nram_h_p[k] = powf((T)2.0, nram_h_p[k]); - } - } - - __bang_write_value(nram_iou_p, deal_num, (T)conf_thresh); - __bang_ge(nram_iou_p, nram_conf_p, nram_iou_p, deal_num); - - // bx0 - __bang_mul_scalar(nram_x_p, nram_x_p, (T)scale, deal_num); - __bang_add_scalar(nram_x_p, nram_x_p, bias, deal_num); - __bang_add(nram_x_p, nram_x_p, nram_cx, deal_num); - __bang_mul(nram_x_p, nram_x_p, nram_imgw, deal_num); - __bang_mul_scalar(nram_x_p, nram_x_p, (T)1.0 / gridw, 
deal_num); - - // by0 - __bang_mul_scalar(nram_y_p, nram_y_p, (T)scale, deal_num); - __bang_add_scalar(nram_y_p, nram_y_p, bias, deal_num); - __bang_add(nram_y_p, nram_y_p, nram_cy, deal_num); - __bang_mul(nram_y_p, nram_y_p, nram_imgh, deal_num); - __bang_mul_scalar(nram_y_p, nram_y_p, (T)1.0 / gridh, deal_num); - - // bw - __bang_mul(nram_w_p, nram_w_p, anchors_w, deal_num); - __bang_mul(nram_w_p, nram_w_p, nram_imgw, deal_num); - __bang_mul_scalar(nram_w_p, nram_w_p, (T)1.0 / inputw, deal_num); - - // bh - __bang_mul(nram_h_p, nram_h_p, anchors_h, deal_num); - __bang_mul(nram_h_p, nram_h_p, nram_imgh, deal_num); - __bang_mul_scalar(nram_h_p, nram_h_p, (T)1.0 / inputh, deal_num); - - // bx0 = bx - bw/2; - // by0 = by - bh/2; - // bx1 = bx + bw/2; - // by1 = by + bh/2; - __bang_mul_scalar(nram_conf_p, nram_w_p, (T)0.5, deal_num); - __bang_add(nram_w_p, nram_x_p, nram_conf_p, deal_num); - __bang_sub(nram_x_p, nram_x_p, nram_conf_p, deal_num); - - __bang_mul_scalar(nram_conf_p, nram_h_p, (T)0.5, deal_num); - __bang_add(nram_h_p, nram_y_p, nram_conf_p, deal_num); - __bang_sub(nram_y_p, nram_y_p, nram_conf_p, deal_num); - - for (int k = 0; k < deal_num; k++) { - if (clip_bbox == true) { - nram_x_p[k] = nram_x_p[k] > (T)0.0 ? nram_x_p[k] : (T)0.0; - nram_y_p[k] = nram_y_p[k] > (T)0.0 ? nram_y_p[k] : (T)0.0; - nram_w_p[k] = nram_w_p[k] < (nram_imgw[k] - (T)1.0) - ? nram_w_p[k] - : (nram_imgw[k] - (T)1.0); - nram_h_p[k] = nram_h_p[k] < (nram_imgh[k] - (T)1.0) - ? 
nram_h_p[k] - : (nram_imgh[k] - (T)1.0); - } - - if (nram_iou_p[k] == (T)0.0) { - nram_x_p[k] = (T)0.0; - nram_y_p[k] = (T)0.0; - nram_w_p[k] = (T)0.0; - nram_h_p[k] = (T)0.0; - } - } -#endif } template @@ -1144,7 +1031,6 @@ __mlu_func__ void computeScore(T *nram_iou, T *nram_conf, T *nram_cls, T *nram_conf_p = nram_conf + offset; T *nram_cls_p = nram_cls + offset; -#if __BANG_ARCH__ >= 322 // compute mask __mluop_sigmoid(nram_conf_p, nram_conf_p, NULL, 0, deal_num); if (iou_aware == true) { @@ -1154,11 +1040,11 @@ __mlu_func__ void computeScore(T *nram_iou, T *nram_conf, T *nram_cls, } else if ((T)iou_aware_factor == (T)1.0) { __bang_write_value(nram_conf_p, deal_num, (T)1.0); } else { - __bang_log(nram_iou_p, nram_iou_p, deal_num); + __bang_log2(nram_iou_p, nram_iou_p, deal_num); __bang_mul_scalar(nram_iou_p, nram_iou_p, (T)iou_aware_factor, deal_num); __bang_pow2(nram_iou_p, nram_iou_p, deal_num); - __bang_log(nram_conf_p, nram_conf_p, deal_num); + __bang_log2(nram_conf_p, nram_conf_p, deal_num); __bang_mul_scalar(nram_conf_p, nram_conf_p, (T)1.0 - (T)iou_aware_factor, deal_num); __bang_pow2(nram_conf_p, nram_conf_p, deal_num); @@ -1181,63 +1067,9 @@ __mlu_func__ void computeScore(T *nram_iou, T *nram_conf, T *nram_cls, __bang_float2int32(nram_mask_int32, nram_iou_p, deal_num, 0); __bang_mul_scalar((int *)nram_mask_int32, (int *)nram_mask_int32, (int)0xffffffff, deal_num); - __bang_cycle_band((char *)nram_cls_p, (char *)nram_cls_p, - (char *)nram_mask_int32, 4 * class_num * deal_num, - 4 * deal_num); -#else - const int x2d = 0x3fb8aa3b; - float log2e = *(float *)&x2d; - __bang_mul_scalar(nram_conf_p, nram_conf_p, (float)-1.0, deal_num); - __bang_mul_scalar(nram_conf_p, nram_conf_p, log2e, deal_num); - if (iou_aware == true) { - __bang_mul_scalar(nram_iou_p, nram_iou_p, (float)-1.0, deal_num); - __bang_mul_scalar(nram_iou_p, nram_iou_p, log2e, deal_num); - } - __bang_mul_scalar(nram_cls_p, nram_cls_p, (float)-1.0, class_num * deal_num); - 
__bang_mul_scalar(nram_cls_p, nram_cls_p, log2e, class_num * deal_num); - - int *nram_mask_int32 = (int *)nram_iou_p; - - for (int k = 0; k < deal_num; ++k) { - // sigmoid(conf) - nram_conf_p[k] = powf((T)2.0, nram_conf_p[k]); - nram_conf_p[k] = (T)1.0 / ((T)1.0 + nram_conf_p[k]); - - if (iou_aware == true) { - // sigmoid(iou) - nram_iou_p[k] = powf((T)2.0, nram_iou_p[k]); - nram_iou_p[k] = (T)1.0 / ((T)1.0 + nram_iou_p[k]); - - // pow(iou, iou_aware_factor) - nram_iou_p[k] = powf(nram_iou_p[k], (T)iou_aware_factor); - // pow(iou, 1-iou_aware_factor) - nram_conf_p[k] = powf(nram_conf_p[k], (T)1.0 - (T)iou_aware_factor); - nram_conf_p[k] = nram_conf_p[k] * nram_iou_p[k]; - } - - if (nram_conf_p[k] < (T)conf_thresh) { - nram_mask_int32[k] = 0; - } else { - nram_mask_int32[k] = 0xffffffff; - } - - for (int ci = 0; ci < class_num; ++ci) { - // sigmoid(cls) - nram_cls_p[ci * deal_num + k] = - powf((T)2.0, nram_cls_p[ci * deal_num + k]); - nram_cls_p[ci * deal_num + k] = - (T)1.0 / ((T)1.0 + nram_cls_p[ci * deal_num + k]); - } - } - - __bang_cycle_mul(nram_cls_p, nram_cls_p, nram_conf_p, class_num * deal_num, - deal_num); - // mask, set 0 - __bang_cycle_band((char *)nram_cls_p, (char *)nram_cls_p, - (char *)nram_mask_int32, 4 * class_num * deal_num, + __bang_cycle_band((int8_t *)nram_cls_p, (int8_t *)nram_cls_p, + (int8_t *)nram_mask_int32, 4 * class_num * deal_num, 4 * deal_num); - -#endif } template diff --git a/mlu_op.h b/mlu_op.h index 5e475eb02..e3c2e4754 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -362,7 +362,7 @@ typedef enum { typedef enum { MLUOP_REDUCE_DSUM = 0, /*!< Computes the sum value. */ MLUOP_REDUCE_DMEAN = 1, /*!< Computes the mean value. */ - MLUOP_REDUCE_DMAX = 2, /*!< Computes the maximun value. */ + MLUOP_REDUCE_DMAX = 2, /*!< Computes the maximum value. */ } mluOpReduceMode_t; /*! @@ -750,9 +750,7 @@ mluOpGetLibVersion(int *major, int *minor, int *patch); * @par API Dependency * - None. 
* @par Note - * - On MLU200 series: - * You cannot set MLUOP_ROUND_HALF_TO_EVEN for the rounding mode because the hardware does - * not support it. + * - None. * * @par Example * - None. @@ -804,7 +802,7 @@ mluOpGetQuantizeRoundMode(mluOpHandle_t handle, mluOpQuantizeRoundMode_t *round_ /*! * @brief Updates the specific atomics mode of MLU-OPS context information that is held by the * \b handle. This function should be called if you want to change the atomics mode that is - * used to cumulate the results.For detailed information, see "Cambricon CNDrv Developer Guide". + * used to cumulate the results. For detailed information, see "Cambricon CNDrv Developer Guide". * * @param[in] handle * Pointer to a Cambricon MLU-OPS context that is used to manage MLU devices and queues. For detailed @@ -1417,7 +1415,7 @@ mluOpSetTensorDescriptor( * The descriptor of the tensor. For detailed information, * see ::mluOpTensorDescriptor_t. * @param[in] pointer_mode - * The pointer mode of the input tensor. For detailed information, seee ::mluOpPointerMode_t. + * The pointer mode of the input tensor. For detailed information, see ::mluOpPointerMode_t. * @par Return * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM * @@ -1571,13 +1569,13 @@ mluOpSetTensorDescriptor_v2(mluOpTensorDescriptor_t desc, * convolution operation. If \b dimNb is set to 4, the output_space should be set in height and width * dimension. If \b dimNb is set to 5, the output_space should be set in depth, height and width dimension. * @param[in] sub_m - * An value that determine the algorithms for sparse convolution. If \b sub_m is set to 0, the + * A value that determine the algorithms for sparse convolution. If \b sub_m is set to 0, the * algorithms will be the default sparse convolution. If \b sub_m is set to 0, the algorithms will be the * submanifold sparse convolution. * @param[in] transpose - * An value that determines transpose. + * A value that determines transpose. 
* @param[in] inverse - * An value that determines inverse. + * A value that determines inverse. * * @par Return * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_EXECUTION_FAILED @@ -1979,7 +1977,7 @@ mluOpSetTensorDescriptorDim_v2(mluOpTensorDescriptor_t desc, int dimNb, const in * @param[in] desc * The descriptor of the tensor desc. For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] onchip_dtype - * The on-chip data type of the tensor is used in the functon that supports fixed-point + * The on-chip data type of the tensor is used in the function that supports fixed-point * computing. * * @par Return @@ -2831,9 +2829,9 @@ mluOpInitTensorSetMemberDescriptor(mluOpTensorSetDescriptor_t tensorSetDesc, * - None. * * @par Note - * - If the member tensor is in floating-point data type, and you need to call + * - If the member tensor is in floating-point data type, and you need to call * this function. - * - If the member tensor is in fixed-point data type, and you need to call + * - If the member tensor is in fixed-point data type, and you need to call * this function. * - Before calling this function, * You need to call ::mluOpCreateTensorSetDescriptor to create @@ -3074,10 +3072,68 @@ mluOpLog(mluOpHandle_t handle, const mluOpTensorDescriptor_t y_desc, void *y); +// Group: Log +/*! + * @brief Returns a one-dimensional tensor of \b steps points logarithmically + * spaced with base \b base between \b base^start and \b base^end. + * + * @param[in] handle + * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and + * queues in the log operation. For detailed information, see ::mluOpHandle_t. + * @param[in] start + * The starting value for the set of points. + * @param[in] end + * The ending value for the set of points. + * @param[in] steps + * Number of points to sample between \b start and \b end. + * @param[in] base + * Base of the logarithm function. 
+ * @param[in] res_desc + * The descriptor of the tensor \b res. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] res + * Pointer to the MLU memory that stores the output tensor \b res. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported data types of output tensor are as follows: + * - output tensor: half, float, int32 + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - \b base cannot be NAN or infinity. + * - \b steps should be greater than or equal to 0. + * - \b steps should be less than or equal to the length of the output tensor \b res. + * + * @par API Dependency + * - None. + * + * @par Note + * - None. + * + * @par Example + * - None. + * + * @par Reference + * - https://github.com/pytorch/pytorch/blob/v2.1.0/aten/src/ATen/native/cuda/RangeFactories.cu#L123 + */ +mluOpStatus_t MLUOP_WIN_API +mluOpLogspace(mluOpHandle_t handle, + const float start, + const float end, + const int64_t steps, + const float base, + const mluOpTensorDescriptor_t res_desc, + void *res); + // Group: Carafe /*! * @brief Creates a descriptor pointed by \b carafe_desc for CARAFE upsampling forward and backward operations, - * and allocates memory holding the configuration parameters.The information is defined in ::mluOpCarafeDescriptor_t. + * and allocates memory holding the configuration parameters. The information is defined in ::mluOpCarafeDescriptor_t. * For more information about descriptor, see "Cambricon MLU-OPS User Guide". * * @param[in] carafe_desc @@ -3363,8 +3419,6 @@ mluOpCarafeForward(mluOpHandle_t handle, * @par Data Type * - The data types of \b input tensor, \b mask tensor, \b grad_output tensor, \b grad_input tensor, and \b grad_mask * tensor must be the same. - * - For MLU200 series, it is not recommended to use half data type for tensors due to the - * low precision. 
* - The supported data types of input and output tensors are as follows: * - input tensor: half, float * - mask tensor: half, float @@ -3797,7 +3851,7 @@ mluOpGetDynamicPointToVoxelForwardWorkspaceSize(mluOpHandle_t handle, * - The first dimension of \b voxel_num tensor must be equal to \b voxel_feats_desc[0]. * * @par API Dependency - * - Before calling this function to perform unique operater, you need to get + * - Before calling this function to perform unique operator, you need to get * the size of workspace by ::mluOpGetDynamicPointToVoxelForwardWorkspaceSize. * * @par Note @@ -4051,7 +4105,7 @@ mluOpGetGenerateProposalsV2WorkspaceSize_v2(mluOpHandle_t handle, * - None. * * @par Note - * - The operater does not support adaptive NMS. + * - The operator does not support adaptive NMS. * - The attribute `eta` should not be less than 1. * - ``nms_thresh`` should be more than 0. * - On MLU300 series and above: @@ -4381,7 +4435,7 @@ mluOpSetNmsDescriptor(mluOpNmsDescriptor_t nms_desc, * @param[in] nms_desc * The descriptor of the Nms function. For detailed information, see ::mluOpNmsDescriptor_t. * @param[in] boxes_desc - * The descriptor of the tensor \b boxes , including the information of dimension, data type, and + * The descriptor of the tensor \b boxes, including the information of dimension, data type, and * layout of input boxes. For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] boxes * Pointer to the MLU memory that stores the input boxes tensor. @@ -4447,10 +4501,8 @@ mluOpSetNmsDescriptor(mluOpNmsDescriptor_t nms_desc, * @par Note * - When the input boxes is in Nms3D format ([boxes_num, 7] or [7, boxes_num]), * both of confidence_desc and confidence should be provided as null pointer. - * - In Nms3D mode, ::mluOpNms will get low precision on MLU200 platform. * - In Nms3D mode, when finding the point with minimum y and minimum x in convex-hull-graham, * it performs min-pooling operation. 
If the input data of pooling contains NaN: - * - On MLU200 series, the \b output value is the NaN. * - On MLU300 series, if the last value in the kernel of the pooling is NaN, the \b output value is NaN. * Otherwise, the \b output value is the minimum value after the last NaN. * @@ -4485,7 +4537,7 @@ mluOpNms(mluOpHandle_t handle, * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and * queues in the Nms operation. For detailed information, see ::mluOpHandle_t. * @param[in] boxes_desc - * The descriptor of the tensor \b boxes , which contains dimension, data type, and + * The descriptor of the tensor \b boxes, which contains dimension, data type, and * data layout of input \b boxes. For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] confidence_desc * The descriptor of the tensor \b confidence , which contains dimension, data type, and @@ -4574,7 +4626,7 @@ mluOpGetNmsWorkspaceSize(mluOpHandle_t handle, * @param[in] offset * The prior box center offset. * @param[in] clip - * A bool value whether to clip out-of-boundary boxes. + * A Boolean value whether to clip out-of-boundary boxes. * @param[in] min_max_aspect_ratios_order * If the value is set as True, the \b output prior box is in * the order of [min, max, aspect_ratios]; otherwise the order is @@ -4636,8 +4688,7 @@ mluOpGetNmsWorkspaceSize(mluOpHandle_t handle, * - The shape of \b output should be the same with \b var. * - The shape[0] of the \b output should be equal to the input height. * - The shape[1] of the \b output should be equal to the input width. - * - The shape[2] of the \b output and \b var must be less than 2100 - * in MLU200 series, and less than 2900 in MLU300 series. + * - The shape[2] of the \b output and \b var must be less than 2900 on MLU300 series. * - The shape[2] of \b output and \b var should be equal to * the product of shape[0] of \b min_sizes and \b aspect_ratios * plus shape[0] of \b max_sizes. 
@@ -4651,7 +4702,7 @@ mluOpGetNmsWorkspaceSize(mluOpHandle_t handle, * * @par Note * - The shape[2] of the \b output and \b var must be - * less than 2100 in MLU200 series, while less than 2900 in MLU300 series. + * less than 2900 on MLU300 series. * * @par Example * - None. @@ -4794,7 +4845,7 @@ mluOpPsRoiPoolForward(mluOpHandle_t handle, // Group: PsRoiPool /*! * @brief Computes the gradients of feature map \b bottom_grad based on the - * inputs \b top_grad , \b rois , and \b mapping_channel to perform the backpropagation + * inputs \b top_grad, \b rois, and \b mapping_channel to perform the backpropagation * of ::mluOpPsRoiPoolForward. * * @param[in] handle @@ -4952,7 +5003,7 @@ mluOpCreateRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t *desc); * If \b pool_mode is 1, the average pooling mode is used. * If \b pool_mode is 0, the maximum pooling mode is used. * @param[in] aligned - * A boolean value which determines whether to shift the boxes by 0.5 pixel. If \b aligned + * A Boolean value which determines whether to shift the boxes by 0.5 pixel. If \b aligned * is true, the boxes is shifted by 0.5. If \b aligned is false, the boxes is not shifted. * * @par Return @@ -5112,8 +5163,8 @@ mluOpDestroyRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t desc); * - This function should be called with ::mluOpSetRoiAlignForwardDescriptor_v2. * * @par Note - * - When \b input contains NaN. If \b pool_mode is maximum pooling_mode, \b output is positive - * saturation value on MLU200 series, and \b output gets more NaN than ieee754 on MLU300 series. + * - When \b input contains NaN, if \b pool_mode is maximum pooling_mode, \b output gets more NaN than + * IEEE 754 on MLU300 series. * * @par Example * - The example of ::mluOpRoiAlignForward_v2 is as follows: @@ -5181,11 +5232,11 @@ mluOpRoiAlignForward_v2(mluOpHandle_t handle, * @param[in] spatial_scale * The spatial scale of each ROI in the output. 
* @param[in] aligned - * A boolean value which determines whether to shift the ROI by 0.5 pixel. If the + * A Boolean value which determines whether to shift the ROI by 0.5 pixel. If the * value of \b aligned is set to true, the ROI is shifted by 0.5. If the value of \b aligned * is set to false, the ROI is not shifted. * @param[in] clockwise - * A boolean value which determines whether the rotation of ROI is clockwise. + * A Boolean value which determines whether the rotation of ROI is clockwise. * @param[out] output_desc * The descriptor of output, which contains dimension and the layout of output. * @param[out] output @@ -5291,11 +5342,11 @@ mluOpRoiAlignRotatedForward(mluOpHandle_t handle, * @param[in] spatial_scale * The spatial scale of each ROI in the output. * @param[in] aligned - * A boolean value which determines whether to shift the ROI by 0.5 pixel. + * A Boolean value which determines whether to shift the ROI by 0.5 pixel. * If the value of \b aligned is set to true, the ROI is shifted by 0.5. If the value * of \b aligned is set to false, the ROI is not shifted. * @param[in] clockwise - * A boolean value which determines whether the rotation of ROI is clockwise. + * A Boolean value which determines whether the rotation of ROI is clockwise. * @param[in] bottom_grad_desc * The descriptor of the tensor \b bottom_grad. * @param[out] bottom_grad @@ -5850,7 +5901,7 @@ mluOpSqrtBackward(mluOpHandle_t handle, * @param[in] NDim * An integer value which is the second dimension of coors. * @param[in] deterministic - * A bool value whether to invoke the non-deterministic + * A Boolean value whether to invoke the non-deterministic * version of hard-voxelization implementations. Currently, * non-deterministic mode is not supported. * @param[in] voxels_desc @@ -5940,11 +5991,11 @@ mluOpGetVoxelizationWorkspaceSize(mluOpHandle_t handle, * in a voxel. * @param[in] max_voxels * An integer value which is the maximum number of voxels this - * function create. 
+ * function creates. * @param[in] NDim * An integer value which is the second dimension of coors. * @param[in] deterministic - * A bool value whether to invoke the non-deterministic + * A Boolean value whether to invoke the non-deterministic * version of hard-voxelization implementations. Currently, * non-deterministic mode is not supported. * @param[in] workspace @@ -6093,7 +6144,7 @@ mluOpVoxelization(mluOpHandle_t handle, * @par Scale Limitation * - The first dimension of x tensor, img_size tensor, boxes tensor and scores * tensor must be the same size. - * - The second dimension (the channel dimension) of x tensor , C should be equal to S * (5 + + * - The second dimension (the channel dimension) of x tensor, C should be equal to S * (5 + * class_num) if \b iou_aware is false, otherwise C should be equal to S * (6 + class_num), * the value S is equal to the anchors tensor size divided by 2. * - The first dimension of anchors tensor should be larger than 0. @@ -6104,8 +6155,7 @@ mluOpVoxelization(mluOpHandle_t handle, * - The third dimension of scores tensor must be equal to \b class_num. * - The fourth dimension of boxes tensor and scores tensor must be equal to the * multiplication result of the third dimension and the fourth dimension of input x tensor. - * - The \b class_num should be larger than 0. On MLU200, the value cannot be - * greater than 1534. On MLU300, the value cannot be greater than 2558. + * - The \b class_num should be larger than 0. On MLU300 series, the value cannot be greater than 2558. * * @par API Dependency * - None. @@ -6213,7 +6263,6 @@ mluOpYoloBox(mluOpHandle_t handle, * - None. * * @par Note - * - The function does not support MLU200 series. * - You need to set the initial value for the output \b pos_memo before calling the funtion, and initialize it to a * negative number. * @@ -6255,7 +6304,7 @@ mluOpVoxelPoolingForward(mluOpHandle_t handle, * IOU (Intersection Over Union) or IOF (Intersection Over Foreground). 
* The integer 0 represents IOU and 1 represents IOF. * @param[in] aligned - * A boolean value. If it is false, then calculate the IOU[i][j] + * A Boolean value. If it is false, then calculate the IOU[i][j] * or IOF[i][j] between the row i of \b bbox1 and the row j of \b bbox2, * otherwise calculate the IOU[i] or IOFs[i] between the row i of \b bbox1 * and the row i of \b bbox2. Significantly, the numbers of rows of \b bbox1 @@ -6312,8 +6361,6 @@ mluOpVoxelPoolingForward(mluOpHandle_t handle, * - When finding the point with minimum y and minimum x in convex-hull-graham, * BoxIouRotated performs min-pooling operation. If the input data of pooling * contains NaN: - * - On MLU200 series: - * - The \b output value is the NaN. * - On MLU300 series: * - If the last value in the kernel of the pooling is NaN, the \b output * value is NaN. Otherwise, the \b output value is the minimum value after @@ -6409,7 +6456,7 @@ mluOpGetNmsRotatedWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescripto * * @par Data Type * - By the order of \b boxes - \b scores - \b output, the supported data types of - * \b boxes , \b scores , and \b output tensors are as follows: + * \b boxes, \b scores, and \b output tensors are as follows: * - float - float - int32 * * @par Scale Limitation @@ -6459,9 +6506,9 @@ mluOpNmsRotated(mluOpHandle_t handle, * An integer value which decides to return a result IOU or IOF. * The integer 0 represents IOU and 1 represents IOF. * @param[in] aligned - * A boolean value. If it is false, this operation calculates the IOUs[i][j] or IOFs[i][j] between + * A Boolean value. If it is false, this operation calculates the IOUs[i][j] or IOFs[i][j] between * the row i of \b bbox1 and the row j of \b bbox2, otherwise the IOU[i] or IOF[i] between - * the row i of \b bbox1 and the row i of \b bbox2 are calculated. The number of row of \b bbox1 + * the row i of \b bbox1 and the row i of \b bbox2 are calculated. 
The number of rows of \b bbox1 * and \b bbox2 must be equal if \b aligned is true. * @param[in] offset * An integer value determines whether to increase the length and the width of the bounding-box by 0 or 1 @@ -6617,12 +6664,12 @@ mluOpBboxOverlaps(mluOpHandle_t handle, * - output tensor: \p MLUOP_LAYOUT_ARRAY * * @par Scale Limitation - * - The dimension of \b features , \b indices , \b weights , and \b output + * - The dimension of \b features, \b indices, \b weights, and \b output * should be equal to 3. - * - The shape[0] of \b features , \b indices , \b weights , and \b output + * - The shape[0] of \b features, \b indices, \b weights, and \b output * should be the same. * - The shape[1] of \b features and \b output should be the same. - * - The shape[1] of \b indices , \b weights , and the shape[2] of \b output + * - The shape[1] of \b indices, \b weights, and the shape[2] of \b output * should be the same. * - The shape[2] of \b indices and \b weights should be equal to 3. * @@ -6632,8 +6679,6 @@ mluOpBboxOverlaps(mluOpHandle_t handle, * @par Note * - The value of \b indices must be in the range of [0, M-1], otherwise the output result * is meaningless and the corresponding output will be set to 0. - * - In MLU200 series, the maximum value in the \b indices should be less than - * 2^23, otherwise the output result is not guaranteed to be correct. * * @par Example * - None. @@ -6655,7 +6700,7 @@ mluOpThreeInterpolateForward(mluOpHandle_t handle, // Group: ThreeInterpolate /*! * @brief Computes the gradients of feature map \b grad_features based on the - * inputs \b grad_output , \b indices , and \b weights to perform the backpropagation + * inputs \b grad_output, \b indices, and \b weights to perform the backpropagation * of ::mluOpThreeInterpolateForward. 
* * @param[in] handle @@ -6721,8 +6766,6 @@ mluOpThreeInterpolateForward(mluOpHandle_t handle, * @par Note * - The value of \b indices must be in the range of [0, M-1], otherwise the output result * is meaningless and the corresponding output will be set to 0. - * - In MLU270 and MLU290, the maximum value in the \b indices should be less than - * 2^23, otherwise the output result is not guaranteed to be correct. * * @par Example * - None. @@ -6880,7 +6923,7 @@ mluOpBallQuery(mluOpHandle_t handle, * ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - The supported data types of input tensors \b input, \b target, \b weight , and output + * - The supported data types of input tensors \b input, \b target, \b weight, and output * tensor \b output are as follows: * - input: half, float * - target: int32 @@ -6994,7 +7037,7 @@ mluOpFocalLossSigmoidForward(mluOpHandle_t handle, * ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - The supported data types of input tensor \b input, \b target, \b weight , and output + * - The supported data types of input tensor \b input, \b target, \b weight, and output * tensor \b output are as follows: * - input: float, half * - target: int32 @@ -7170,7 +7213,7 @@ mluOpGetMaskedIm2colForwardWorkspaceSize(mluOpHandle_t handle, * - data_col tensor: half, float. * * @par Data Layout - * - The supported data layouts of \b feature , \b mask_h_idx , \b mask_w_idx , and \b data_col are as follows: + * - The supported data layouts of \b feature, \b mask_h_idx, \b mask_w_idx, and \b data_col are as follows: * - feature tensor: \p MLUOP_LAYOUT_NCHW. * - mask_h_idx tensor: \p MLUOP_LAYOUT_ARRAY. * - mask_w_idx tensor: \p MLUOP_LAYOUT_ARRAY. @@ -7291,8 +7334,8 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle, * tensor must be the same size and equal to \b samples . * - The second dimension of \b grad_input tensor and \b dispatch tensor must be equal to \b hidden . 
* - The first dimension of \b dispatch tensor must be equal to the multiplication result of - * the \b capacity and \b num_experts . - * - The value of the input parameters \b samples , \b capacity , \b hidden , and \b num_experts + * the \b capacity and \b num_experts. + * - The value of the input parameters \b samples, \b capacity , \b hidden , and \b num_experts * must be greater than or equal to 0. * - The value range of the input parameter \b indices tensor must be greater than or equal to 0 and less than * \b num_experts. @@ -7304,7 +7347,7 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle, * * @par Note * - This function is only supported on MLU300 series or above platforms. - * - The parameter \b samples , \b capacity , \b hidden , and \b num_experts should not be negative. + * - The parameter \b samples, \b capacity , \b hidden , and \b num_experts should not be negative. * * @par Example * - The example of the function is as follows: @@ -7420,7 +7463,6 @@ mluOpMoeDispatchBackwardData(mluOpHandle_t handle, * - The input \b sampling_loc that contains NaN or infinity is not supported. * - The \b value, \b sampling_loc, \b with attn_weight and \b grad_output contain NaN or infinity are not * supported on series higher than MLU300 series currently. - * - The function does not support MLU200 series. * * @par Example * - None. @@ -7475,7 +7517,7 @@ mluOpMsDeformAttnBackward(mluOpHandle_t handle, * The descriptor of the tensor \b ans_grad containing dimension, data type, and data layout. * For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] overwrite_ans_grad - * A boolean value indicating whether to overwrite \b ans_grad. + * A Boolean value indicating whether to overwrite \b ans_grad. * @param[out] workspace_size * Pointer to the MLU memory that stores the returned size of the extra workspace in bytes. 
* @@ -7546,7 +7588,7 @@ mluOpGetMutualInformationBackwardWorkspaceSize(mluOpHandle_t handle, * @param[in] ans_grad * Pointer to the MLU memory that stores the tensor \b ans_grad. * @param[in] overwrite_ans_grad - * A boolean value indicating whether to overwrite \b ans_grad. + * A Boolean value indicating whether to overwrite \b ans_grad. * @param[in] workspace * Pointer to the MLU memory as an extra workspace for the mutual_information_backward operation. * For more information about the workspace, see "Cambricon MLU-OPS User Guide". @@ -7897,7 +7939,7 @@ mluOpGetRoiAwarePool3dForwardWorkspaceSize(mluOpHandle_t handle, * @brief Returns \b argmax, \b pts_idx_of_voxels and \b pooled_features calculated by * this operator. * - * The operator determine the points in each box based on input coordinates. The collection + * The operator determines the points in each box based on input coordinates. The collection * of points in boxes are named as voxels and recorded as \b pts_idx_of_voxels. The operator * also performs max pooling or average pooling on the voxels and results in \b argmax * and \b pooled_features. @@ -7994,7 +8036,6 @@ mluOpGetRoiAwarePool3dForwardWorkspaceSize(mluOpHandle_t handle, * @par Note * - The inputs \b rois and \b pts with NaN or infinity are not supported on MLU300 series. * - The input \b pts_feature with NaN are not supported on MLU300 series. - * - The function does not support MLU200 series. * * @par Example * - None. @@ -8032,7 +8073,7 @@ mluOpRoiawarePool3dForward(mluOpHandle_t handle, * @brief Returns \b argmax, \b pts_idx_of_voxels and \b pooled_features calculated by * this operator. * - * The operator determine the points in each box based on input coordinates. The collection + * The operator determines the points in each box based on input coordinates. The collection * of points in boxes are named as voxels and recorded as \b pts_idx_of_voxels. 
The operator * also performs max pooling or average pooling on the voxels and results in \b argmax * and \b pooled_features. @@ -8125,7 +8166,6 @@ mluOpRoiawarePool3dForward(mluOpHandle_t handle, * @par Note * - The inputs \b rois and \b pts with NaN or infinity are not supported on MLU300 series. * - The input \b pts_feature with NaN are not supported on MLU300 series. - * - The function does not support MLU200 series. * * @par Example * - None. @@ -8229,7 +8269,7 @@ mluOpRoiAwarePool3dForward(mluOpHandle_t handle, * - None. * * @par Note - * - The function does not support MLU200 series. + * - None. * * @par Example * - None. @@ -8322,7 +8362,7 @@ mluOpRoiawarePool3dBackward(mluOpHandle_t handle, * - None. * * @par Note - * - The function does not support MLU200 series. + * - None. * * @par Example * - None. @@ -8350,7 +8390,7 @@ mluOpRoiAwarePool3dBackward(mluOpHandle_t handle, // Group: Psamask /*! - * @brief Moves the \b x tensor to \b y tensor according to \b h_mask , \b w_mask , and \b psa_type. + * @brief Moves the \b x tensor to \b y tensor according to \b h_mask, \b w_mask, and \b psa_type. * * * @param[in] handle @@ -8394,11 +8434,6 @@ mluOpRoiAwarePool3dBackward(mluOpHandle_t handle, * - If the shape of \b x is set to [N, H, W, C], the size of C dimension should be \b h_mask * \b * w_mask. * - If the shape of \b y is set to [N, H, W, C], the size of C dimension should be H * W. - * - On MLU200 series: - * - When psa_type is COLLECT, the size of \b x channels ci and \b y channels co should be - * satisfied: ci + co <= 6144. - * - When psa_type is DISTRIBUTE, the size of \b x channels ci and \b y channels co should be - * satisfied: ci + 2 * co <= 6144. * - On MLU300 series: * - When psa_type is COLLECT, the size of \b x channels ci and \b y channels co should be * satisfied: ci + co <= 10240. @@ -8430,7 +8465,7 @@ mluOpPsamaskForward(mluOpHandle_t handle, // Group: Psamask /*! 
* @brief Computes the gradients of input tensor \b dx with the gradients of output tensor \b dy - * according to \b h_mask , \b w_mask , and \b psa_type. + * according to \b h_mask, \b w_mask, and \b psa_type. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and @@ -8473,11 +8508,6 @@ mluOpPsamaskForward(mluOpHandle_t handle, * - If the shape of \b dx is set to [N, H, W, C], the size of C dimension should be \b h_mask * \b * w_mask . * - If the shape of \b dy is set to [N, H, W, C], the size of C dimension should be H * W. - * - On MLU200 series: - * - When psa_type is COLLECT, the size of \b dx channels ci and \b dy channels co should be - * satisfied: ci + co <= 6144. - * - When psa_type is DISTRIBUTE, the size of \b dx channels ci and \b dy channels co should be - * satisfied: ci + 2 * co <= 6144. * - On MLU300 series: * - When psa_type is COLLECT, the size of \b dx channels ci and \b dy channels co should be * satisfied: ci + co <= 10240. @@ -8509,7 +8539,7 @@ mluOpPsamaskBackward(mluOpHandle_t handle, // Group: SparseConv /*! * @brief Computes the get_indice_paris operation, then returns the results in the output - * tensor \b out_indices , \b indice_pairs and \b ind, ice_num. + * tensor \b out_indices, \b indice_pairs and \b indice_num. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the @@ -8553,7 +8583,7 @@ mluOpPsamaskBackward(mluOpHandle_t handle, * @par Data Type * - This function supports the combinations of the following data types for * input tensor \b indices and output tensor \b out_indices, \b indice_pairs and \b indice_num. - * - \b indices , \b out_indices , \b indice_pairs , and \b indice_num data type: int32, int32, int32, int32 + * - \b indices, \b out_indices, \b indice_pairs, and \b indice_num data type: int32, int32, int32, int32 * * @par Data Layout * - None. 
@@ -8611,8 +8641,8 @@ mluOpGetIndicePairs(mluOpHandle_t handle, * to optimize the get_indice_pairs operation. * * The size of extra workspace is based on the given information of the get_indice_pairs - * operation, including the input tensor descriptor \b sparse_conv_desc , and \b indices_desc, output - * tensor descriptor \b out_indices_desc , \b indice_pairs_desc , and \b indice_num_desc. + * operation, including the input tensor descriptor \b sparse_conv_desc, and \b indices_desc, output + * tensor descriptor \b out_indices_desc, \b indice_pairs_desc, and \b indice_num_desc. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the @@ -8650,7 +8680,7 @@ mluOpGetIndicePairs(mluOpHandle_t handle, * * @par API Dependency * - You need to call ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor to create and set - * tensor descriptors \b indices_desc , \b out_indices_desc , \b indice_pairs_desc , and \b indice_num_desc before + * tensor descriptors \b indices_desc, \b out_indices_desc, \b indice_pairs_desc, and \b indice_num_desc before * calling this function. * - You need to call ::mluOpCreateSparseConvolutionDescriptor to create a descriptor, * and call ::mluOpSetSparseConvolutionDescriptor to set the tensor information for @@ -8963,7 +8993,7 @@ mluOpCreateAdamWDescriptor(mluOpAdamWDescriptor_t *adamw_desc); /*! * @brief Initializes the descriptor \b adamw_desc that was previously created with * ::mluOpCreateAdamWDescriptor function, and sets AdamW information - * to the descriptor \b adamw_desc. The information includes \b weight_decay , \b grad_scale + * to the descriptor \b adamw_desc. The information includes \b weight_decay, \b grad_scale * and \b use_nesterov for AdamW operation. * * @param[in] adamw_desc @@ -9147,7 +9177,7 @@ mluOpDeformRoiPoolForward(const mluOpHandle_t handle, // Group: DeformRoiPool /*! 
* @brief Computes the gradient of input \b grad_input and the gradient of offset \b grad_offset - * based on the gradient of output \b grad_output , input \b input , ROI \b rois , and offset \b offset. + * based on the gradient of output \b grad_output, input \b input, ROI \b rois, and offset \b offset. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in @@ -9314,11 +9344,11 @@ mluOpDeformRoiPoolBackward(const mluOpHandle_t handle, * - boxes tensor: half, float * - output tensor: half, float * - argmax_idx tensor: int32_t - * Note that the data type of \b input , \b boxes , and \b output + * Note that the data type of \b input, \b boxes, and \b output * must be the same. * * @par Data Layout - * - The supported data layout of \b input , \b boxes , \b output , and + * - The supported data layout of \b input, \b boxes, \b output, and * \b argmax_idx are as follows: * - input tensor: \p MLUOP_LAYOUT_NHWC * - boxes tensor: \p MLUOP_LAYOUT_ARRAY @@ -9414,7 +9444,7 @@ mluOpBorderAlignForward(mluOpHandle_t handle, // Group: BorderAlign /*! * @brief Computes the gradient of the input tensor of ::mluOpBorderAlignForward - * according to the output gradient \b grad_output , the maximum pooling index \b + * according to the output gradient \b grad_output, the maximum pooling index \b * argmax_idx and bounding boxes \b boxes . * * @param[in] handle @@ -9435,14 +9465,14 @@ mluOpBorderAlignForward(mluOpHandle_t handle, * Pointer to the MLU memory that stores \b boxes tensors. The shape of \b boxes is * [N, H * W, 4]. * @param[in] argmax_idx_desc - * Descriptor of \b argmax_idx , containing dimension and the layout of \b argmax_idx . + * Descriptor of \b argmax_idx, containing dimension and the layout of \b argmax_idx . * @param[in] argmax_idx * Pointer to the MLU memory that stores the \b argmax_idx tensor, which is the result * of max pooling index. The shape of argmax_idx is [N, K, 4, C]. 
* @param[in] pool_size * Number of positions sampled over the boxes borders. * @param[in] grad_input_desc - * Descriptor of \b grad_input , containing dimension and the layout of output. + * Descriptor of \b grad_input, containing dimension and the layout of output. * @param[out] grad_input * Pointer to the MLU memory that stores the gradient of the input * tensor of ::mluOpBorderAlignForward. The shape of \b grad_input is [N, H, W, 4C], @@ -9459,11 +9489,11 @@ mluOpBorderAlignForward(mluOpHandle_t handle, * - boxes tensor: half, float * - argmax_idx tensor: int32_t * - grad_input tensor: half, float - * Note that the data type of \b grad_output , \b boxes , and \b grad_input + * Note that the data type of \b grad_output, \b boxes, and \b grad_input * must be the same. * * @par Data Layout - * - The supported data layout of \b grad_output , \b boxes , \b argmax_idx and, + * - The supported data layout of \b grad_output, \b boxes, \b argmax_idx, and * \b grad_input are as follows: * - grad_output tensor: \p MLUOP_LAYOUT_NHWC * - boxes tensor: \p MLUOP_LAYOUT_ARRAY @@ -9471,7 +9501,7 @@ mluOpBorderAlignForward(mluOpHandle_t handle, * - grad_input tensor: \p MLUOP_LAYOUT_NHWC * * @par Scale Limitation - * - The \b grad_output , \b argmax_idx and \b grad_input are 4D tensor. + * - The \b grad_output, \b argmax_idx and \b grad_input are 4D tensors. * - The \b boxes is 3D tensor. * - The dims[3] of \b boxes should be equal to 4. * - The shape of \b grad_output and \b argmax_idx must be the same. @@ -9561,9 +9591,9 @@ mluOpBorderAlignBackward(mluOpHandle_t handle, * * The size of extra workspace is based on the given information of the indice * convolution backward data operation, including the input descriptor - * \b input_grad_desc, the filter descriptor \b filter_desc , the indice pairs - * descriptor \b indice_pairs_desc , the output descriptor \b indice_pairs_desc , - * the array \b indice_num , and the scaler \b inverse. 
For more information + * \b input_grad_desc, the filter descriptor \b filter_desc, the indice pairs + * descriptor \b indice_pairs_desc, the output descriptor \b indice_pairs_desc, + * the array \b indice_num, and the scalar \b inverse. For more information * about the workspace, see "Cambricon MLU-OPS User Guide". * * @param[in] handle @@ -9605,7 +9635,7 @@ mluOpBorderAlignBackward(mluOpHandle_t handle, * @par API Dependency * - This function must be called before ::mluOpIndiceConvolutionBackwardData. * - ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor - * create and set the tensor descriptor \b output_grad_desc , \b filters_desc , + * create and set the tensor descriptor \b output_grad_desc, \b filters_desc, * \b indice_pairs_desc and \b input_grad_desc before this function is called. * * @par Note @@ -9631,7 +9661,7 @@ mluOpGetIndiceConvolutionBackwardDataWorkspaceSize(mluOpHandle_t handle, /*! * @brief Performs the back propagation of an indice convolution operation to * compute the gradient of input \b input_grad based on the gradient of response - * \b output_grad , the filter tensor \b filter , the indice tensor \b indice_pairs , + * \b output_grad, the filter tensor \b filter, the indice tensor \b indice_pairs, * and helper parameters: array \b indice_num, scaler \b inverse and \b sub_m. * * The tensors \b input_grad and \b output_grad are reordered from origin input @@ -9773,8 +9803,8 @@ mluOpIndiceConvolutionBackwardData(mluOpHandle_t handle, * to optimize the indice_convolution_backward_filter operation. * * The size of extra workspace is based on the given information of the indice_convolution_backward_filter - * operation, including the input tensor descriptor \b features_desc , \b output_grad_desc , and \b indice_pairs_desc , - * output tensor descriptor \b filters_grad_desc , and the array \b indice_num[]. 
+ * operation, including the input tensor descriptor \b features_desc, \b output_grad_desc, and \b indice_pairs_desc, + * output tensor descriptor \b filters_grad_desc, and the array \b indice_num[]. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the @@ -9815,7 +9845,7 @@ mluOpIndiceConvolutionBackwardData(mluOpHandle_t handle, * * @par API Dependency * - You need to call ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor to create and set - * tensor descriptors \b features_desc , \b output_grad_desc , \b indice_pairs_desc , and \b filters_grad_desc before + * tensor descriptors \b features_desc, \b output_grad_desc, \b indice_pairs_desc, and \b filters_grad_desc before * calling this function. * - The allocated extra workspace should be passed to ::mluOpIndiceConvolutionBackwardFilter to * perform the indice_convolution_backward_filter operation. @@ -9887,9 +9917,9 @@ mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize(mluOpHandle_t handle, * * @par Data Type * - This function supports the combinations of the following data types for - * input tensor \b features , \b output_grad , \b indice_pairs_num , and output tensor \b filters_grad. - * - \b features , \b output_grad , \b indice_pairs , \b filters_grad data type: half, half, int32, half - * - \b features , \b output_grad , \b indice_pairs , \b filters_grad data type: float, float, int32, float + * input tensor \b features, \b output_grad, \b indice_pairs_num, and output tensor \b filters_grad. + * - \b features, \b output_grad, \b indice_pairs, \b filters_grad data type: half, half, int32, half + * - \b features, \b output_grad, \b indice_pairs, \b filters_grad data type: float, float, int32, float * * @par Data Layout * - None. 
@@ -10077,7 +10107,7 @@ mluOpGetRoiPointPool3dWorkspaceSize(mluOpHandle_t handle, * * @par Data Type * - The supported data types for input and output are as follows: - * Note that the data type of \b points , \b point_features , \b boxes3d , and + * Note that the data type of \b points, \b point_features, \b boxes3d , and * \b pooled_features must be the same. * - points: half, float * - point_features: half, float @@ -10218,9 +10248,9 @@ mluOpGetThreeNNForwardWorkspaceSize(const mluOpHandle_t handle, * @par Scale Limitation * - The shape of \b unknown, \b dist2 and \b idx should be [b, n, 3]. * - The shape of \b known should be [b, m, 3]. - * - The shape of \b unknown , \b dist2 , \b idx , and \b known dims[0](b) should be equal. - * - The shape of \b unknown , \b dist2 , \b idx , and \b known dims[2](3) should be equal to 3. - * - The shape of \b unknown , \b dist2 , \b idx , and \b known dims[1](n) should be equal and larger + * - The shape of \b unknown , \b dist2 , \b idx, and \b known dims[0](b) should be equal. + * - The shape of \b unknown , \b dist2 , \b idx, and \b known dims[2](3) should be equal to 3. + * - The shape of \b unknown , \b dist2 , \b idx, and \b known dims[1](n) should be equal and larger * than 0. 
* * @par API Dependency @@ -10374,9 +10404,9 @@ mluOpGetIndiceConvolutionForwardWorkspaceSize(mluOpHandle_t handle, * * @par Data Type * - This function supports the combination of the following data types: - * - input tensor \b features , \b filters , \b indice_pairs , and output tensor \b features_out: half, half, int32, + * - input tensor \b features, \b filters, \b indice_pairs, and output tensor \b features_out: half, half, int32, * half - * - input tensor \b features , \b filters , \b indice_pairs , and output tensor \b features_out: float, float, int32, + * - input tensor \b features, \b filters, \b indice_pairs, and output tensor \b features_out: float, float, int32, * float * - The supported data type of array \b indice_num , scalar \b inverse , and \b sub_m is int64. * @@ -10489,8 +10519,8 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, * ::MLUOP_STATUS_NOT_SUPPORTED, ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - This function supports the following data types for input tensors \b gates , \b indices , - * \b locations , \b input , and \b dispatch. + * - This function supports the following data types for input tensors \b gates, \b indices, + * \b locations, \b input , and \b dispatch. * - gates tensor: float * - indices tensor: int32 * - locations tensor: int32 @@ -10505,7 +10535,7 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, * tensor must be the same size and equal to \b samples. * - The second dimension of \b input tensor and \b dispatch tensor must be equal to \b hidden . * - The first dimension of \b dispatch tensor must be equal to the multiplication result of - * the \b capacity and \b num_experts . + * the \b capacity and \b num_experts. * - The samples must be less than or equal to the multiplication result of the \b capacity and \b * num_experts. * - The values of indices must be between 0 and (num_experts-1) . 
@@ -10516,7 +10546,7 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, * * @par Note * - This function is only supported on MLU300 series or above platforms. - * - The parameters \b samples , \b capacity , \b hidden , and \b num_experts should not be negative. + * - The parameters \b samples, \b capacity , \b hidden , and \b num_experts should not be negative. * * @par Example * - The example of the function is as follows: @@ -10637,7 +10667,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, * @param[in] workspace_size * The size of the extra workspace in bytes. * @param[in] grad_gates_desc - * The descriptor of the tensor \b grad_gates , which contains dimension, data type, and data layout. + * The descriptor of the tensor \b grad_gates, which contains dimension, data type, and data layout. * For detailed information, see ::mluOpTensorDescriptor_t. * @param[out] grad_gates * Pointer to the MLU memory that stores the \b grad_gates tensor. @@ -10663,7 +10693,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, * - The second dimension of \b input tensor and \b dispatch tensor must be equal to \b hidden. * - The first dimension of \b dispatch tensor must be equal to the multiplication result of * the \b capacity and \b num_experts. - * - The value of the input parameters \b samples , \b capacity , \b hidden , and \b num_experts + * - The value of the input parameters \b samples, \b capacity , \b hidden , and \b num_experts * must be greater than or equal to 0. * - The value range of the input parameter \b indices tensor must be greater than or equal to 0 and less than * \b num_experts. @@ -10676,7 +10706,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, * * @par Note * - This function is only supported on MLU300 series or above platforms. - * - The parameters \b samples , \b capacity , \b hidden , and \b num_experts should not be negative. 
+ * - The parameters \b samples, \b capacity , \b hidden , and \b num_experts should not be negative. * * @par Example * - The example of the operation is as follows: @@ -10800,7 +10830,7 @@ mluOpPointsInBoxes(mluOpHandle_t handle, * @param[in] sampling_ratio * The number of sampling points in the grid used to compute the output. * @param[in] aligned - * A boolean value which determines whether to shift the boxes by 0.5 pixel. + * A Boolean value which determines whether to shift the boxes by 0.5 pixel. * @param[in] grads_image_desc * The descriptor of the tensor \b grads_image of the original images. * @param[out] grads_image @@ -10912,7 +10942,7 @@ mluOpRoiAlignBackward(mluOpHandle_t handle, * @param[in] sampling_ratio * The number of sampling points in the grid used to compute the output. * @param[in] aligned - * A boolean value which determines whether to shift the boxes by 0.5 pixel. If the value + * A Boolean value which determines whether to shift the boxes by 0.5 pixel. If the value * of \b aligned is set to true, the boxes are shifted by 0.5. If the value of \b aligned is set * to false, the boxes are not shifted. * @param[in] pool_mode @@ -11498,20 +11528,15 @@ mluOpDiffIouRotatedSortVerticesForward(mluOpHandle_t handle, * - \b Spatial_scale should be in the range of (0, 1]. * - \b Output consists of [rois_num, pooled_h, pooled_w, channels]. In the dimensions of the h and w of the input * and the output, (\b x2 - \b x1) * (\b y2 - \b y1) * \b spatial_scale * \b spatial_scale / (\b pooled_h * \b - * pooled_w) < (nram_limitation / 32). Nram_limitation means the limitation of the nram. When the supported MLU - * platform is 200, the nram_limitation is (98304 - 4 * \b channels) / 2. When the supported MLU platform is 300, - * the nram_limitation is (163804 - 4 * \b channels) / 2. \b pooled_h means height of output. \b pooled_w means width - * of output. + * pooled_w) < (nram_limitation / 32). Nram_limitation means the limitation of the nram. 
On MLU300 series, + * the nram_limitation is (163804 - 4 * \b channels) / 2. \b pooled_h means height of output. + * \b pooled_w means width of output. * * @par API Dependency * - None * * @par Note - * - It is not recommended to set the data type of input tensor, ROIS tensor and output tensors - * that may cause the low accuracy on MLU200 series. * - When the input data or parameter contains NaN or infinity: - * - On MLU200 series, the \b output is the positive saturation value. - * The \b argmax is the index of the last NaN in the kernel of the pooling. * - On MLU300 series, if the last value in the kernel of the pooling is NaN, \b argmax is * the index of the last value, \b output is the last value, as shown in example 2 below. * Otherwise, \b argmax is the index of the maximum value after the last NaN, @@ -13533,10 +13558,6 @@ mluOpGetDCNForwardWorkspaceSize(mluOpHandle_t handle, * - The off-chip data type of \p input, \p offset, \p mask, \p filter, \p bias, and \p output must be the same. * - The supported off-chip data types of the input tensor and output tensor are as follows: * - input, offset, mask, filter, bias, output: half, float. - * - This function supports any combinations of the following on-chip data types for input tensor - * \p input and \p filter on MLU200 series. - * - \p input onchip data type: int16, int31. - * - \p filter onchip data type: int16, int31. * - \p input offchip data type can be combined with any supported onchip data types. * - \p filter offchip data type can be combined with any supported onchip data types. * - This function also supports floating-point computation on MLU300 series or above. @@ -13781,10 +13802,6 @@ mluOpGetDCNBackwardWeightWorkspaceSize(mluOpHandle_t handle, * and \p grad_bias must be the same. * - The supported off-chip data types of the input tensor and output tensor are as follows: * - input, offset, mask, grad_output, grad_filter, grad_bias, grad_mask: half, float. 
- * - This function supports any combinations of the following on-chip data types for input tensor - * \p grad_output and \p input on MLU200 series. - * - \p grad_output on-chip data type: int16, int31. - * - \p filter on-chip data type: int16, int31. * - \p grad_output off-chip data type can be combined with any supported on-chip data types. * - \p input off-chip data type can be combined with any supported on-chip data types. * - This function also supports floating-point computation on MLU300 series or above. To perform @@ -14027,9 +14044,6 @@ mluOpGetDCNBakcwardDataWorkspaceSize(mluOpHandle_t handle, * - The supported offchip data types of the input tensor and output tensor are as follows: * - input, offset, mask, filter, grad_output, grad_input, grad_offset, grad_mask: half, float. * - This function supports any combinations of the following onchip data types for input tensor - * \p grad_output and \p filter on MLU200 series. - * - \p grad_output onchip data type: int16, int31. - * - \p filter onchip data type: int16, int31. * - \p grad_output offchip data type can be combined with any supported onchip data types. * - \p filter offchip data type can be combined with any supported onchip data types. * - This function also supports floating-point computation on MLU300 series or above. To perform @@ -14328,6 +14342,24 @@ mluOpSetFFTReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, void *rese * output is n/2 + 1 non-redundant complex numbers. This requires a padding of input array. * - For in-place N-D real-to-complex FFTs, extra padding of the real-data array on the innermost * dimension is necessary to accommodate the size of the complex-data output. 
+ * - For 2D FFTs, cases with strides that meet the following conditions have + * better performance: + * - real-to-complex: + * - n[0] < 200, n[0] == inembed[0], onembed[0] == n[0] + * - n[1] < 200, n[1] == inembed[1], onembed[1] == n[1]/2+1 + * - input: dims[batch, n0, n1], strides[1, batch*n1, batch] + * - output: dims[batch, n0, n1/2+1], strides[1, batch*(n1/2+1), batch] + * - complex-to-complex: + * - n[0] < 200, n[0] == inembed[0], onembed[0] == n[0] + * - n[1] < 200, n[1] == inembed[1], onembed[1] == n[1] + * - input: dims[batch, n0, n1], strides[1, batch*n1, batch] + * - output: dims[batch, n0, n1], strides[1, batch*n1, batch] + * - complex-to-real: + * - n[0] < 200, n[0] == inembed[0], onembed[0] == n[0] + * - n[1] < 200, n[1]/2+1 == inembed[1], onembed[1] == n[1] + * - input: dims[batch, n0, n1/2+1], strides[1, batch*(n1/2+1), batch] + * - output: dims[batch, n0, n1], strides[1, batch*n1, batch] + * * - When \p input contains NaN or infinity and the input onchip data type of FFT is not quantized * data type, the output is computed through the FFT formula with computation rules of NaN or * infinity based on IEEE 754. diff --git a/samples/mlu-ops/fault_sample/fault_demo.mlu b/samples/mlu-ops/fault_sample/fault_demo.mlu index 14979ddc2..fecc90a48 100644 --- a/samples/mlu-ops/fault_sample/fault_demo.mlu +++ b/samples/mlu-ops/fault_sample/fault_demo.mlu @@ -91,12 +91,12 @@ int main(int argc, char *argv[]) { printf("memcpy input data from host to device\n"); CNRT_CHECK(cnrtMemcpy(device_ptrs[0], host_ptrs[0], data_size, - CNRT_MEM_TRANS_DIR_HOST2DEV)); + cnrtMemcpyHostToDev)); printf("call device kernel\n"); // set function type and task dim cnrtDim3_t k_dim = {4, 1, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1; + cnrtFunctionType_t k_type = cnrtFuncTypeUnion1; // call device kernel function. 
MLUUnfinishedExampleKernel<<>>( device_ptrs[0], device_ptrs[1], element_num); @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) { printf("copy result from device to host\n"); CNRT_CHECK(cnrtMemcpy(host_ptrs[1], device_ptrs[1], data_size, - CNRT_MEM_TRANS_DIR_DEV2HOST)); + cnrtMemcpyDevToHost)); printf("free resources\n"); for (size_t i = 0; i < TENSOR_NUM; ++i) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward.cpp index 8f7201339..eb903ed7c 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward.cpp @@ -56,7 +56,7 @@ class active_rotated_filter_forward : public testing::Test { } else { i_bytes = 12 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)) } if (indices_desc) { @@ -76,11 +76,11 @@ class active_rotated_filter_forward : public testing::Test { } else { id_bytes = 12 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32); } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&indices_, id_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, id_bytes)) } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)) } if (output_desc) { @@ -100,7 +100,7 @@ class active_rotated_filter_forward : public testing::Test { } else { o_bytes = 12 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, o_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, o_bytes)) } } mluOpStatus_t compute() { @@ -140,7 +140,7 @@ class active_rotated_filter_forward : public testing::Test { } if (input_) { VLOG(4) << 
"Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (indices_desc_) { @@ -150,12 +150,12 @@ class active_rotated_filter_forward : public testing::Test { } if (indices_) { VLOG(4) << "Destroy indices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } if (output_desc_) { @@ -165,7 +165,7 @@ class active_rotated_filter_forward : public testing::Test { } if (output_) { VLOG(4) << "Destroy output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward_general.cpp index 8f23f4278..a00f64c42 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/active_rotated_filter_forward/active_rotated_filter_forward_general.cpp @@ -62,7 +62,7 @@ class active_rotated_filter_forward_general if (input_elenum > 0) { VLOG(4) << "malloc input_"; uint64_t i_bytes = input_elenum * mluOpDataTypeBytes(input_dtype); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)) } MLUOpTensorParam indices_desc = std::get<1>(GetParam()); @@ -85,7 +85,7 @@ class active_rotated_filter_forward_general if (indices_elenum > 0) { VLOG(4) << "malloc indices_"; uint64_t id_bytes = indices_elenum * mluOpDataTypeBytes(indices_dtype); - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtMalloc(&indices_, id_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, id_bytes)) } MLUOpTensorParam output_desc = std::get<2>(GetParam()); @@ -108,7 +108,7 @@ class active_rotated_filter_forward_general if (output_elenum > 0) { VLOG(4) << "malloc output_"; uint64_t o_bytes = output_elenum * mluOpDataTypeBytes(output_dtype); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)) } target_device_ = std::get<3>(GetParam()); @@ -132,7 +132,7 @@ class active_rotated_filter_forward_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)) status = mluOpActiveRotatedFilterForward( handle_, input_desc_, input_, indices_desc_, indices_, workspace_, workspace_size_, output_desc_, output_); @@ -156,7 +156,7 @@ class active_rotated_filter_forward_general } if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (indices_desc_) { @@ -166,12 +166,12 @@ class active_rotated_filter_forward_general } if (indices_) { VLOG(4) << "Destroy indices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } if (output_desc_) { @@ -181,7 +181,7 @@ class active_rotated_filter_forward_general } if (output_) { VLOG(4) << "Destroy output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query.cpp 
b/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query.cpp index 126be7832..e5cc12343 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query.cpp @@ -51,7 +51,7 @@ class ball_query : public testing::Test { size_t new_xyz_ele_num = 2 * 16 * 3; size_t new_xyz_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t new_xyz_bytes = new_xyz_ele_num * new_xyz_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&new_xyz_, new_xyz_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&new_xyz_, new_xyz_bytes)); } if (xyz_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&xyz_desc_)); @@ -64,7 +64,7 @@ class ball_query : public testing::Test { size_t xyz_ele_num = 2 * 4 * 3; size_t xyz_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t xyz_bytes = xyz_ele_num * xyz_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&xyz_, xyz_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&xyz_, xyz_bytes)); } if (idx_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&idx_desc_)); @@ -77,7 +77,7 @@ class ball_query : public testing::Test { size_t idx_ele_num = 2 * 4 * 32; size_t idx_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t idx_bytes = idx_ele_num * idx_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&idx_, idx_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&idx_, idx_bytes)); } } @@ -101,7 +101,7 @@ class ball_query : public testing::Test { new_xyz_desc_ = NULL; } if (new_xyz_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(new_xyz_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(new_xyz_)); new_xyz_ = NULL; } if (xyz_desc_) { @@ -109,7 +109,7 @@ class ball_query : public testing::Test { xyz_desc_ = NULL; } if (xyz_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(xyz_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(xyz_)); xyz_ = NULL; } if (idx_desc_) { @@ -117,7 +117,7 @@ class ball_query : public testing::Test { idx_desc_ = NULL; } if (idx_) { - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(idx_)); idx_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query_general.cpp index ee7b03c12..5e72e2418 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/ball_query/ball_query_general.cpp @@ -64,7 +64,7 @@ class ball_query_general : public testing::TestWithParam { uint64_t xyz_ele_num = mluOpGetTensorElementNum(xyz_desc_); uint64_t xyz_bytes = mluOpDataTypeBytes(xyz_dtype) * xyz_ele_num; if (xyz_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&xyz_, xyz_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&xyz_, xyz_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&new_xyz_desc_)); @@ -88,7 +88,7 @@ class ball_query_general : public testing::TestWithParam { uint64_t new_xyz_bytes = mluOpDataTypeBytes(new_xyz_dtype) * new_xyz_ele_num; if (new_xyz_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&new_xyz_, new_xyz_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&new_xyz_, new_xyz_bytes)) } min_radius_ = std::get<2>(GetParam()); @@ -113,7 +113,7 @@ class ball_query_general : public testing::TestWithParam { uint64_t idx_ele_num = mluOpGetTensorElementNum(idx_desc_); uint64_t idx_bytes = mluOpDataTypeBytes(idx_dtype) * idx_ele_num; if (idx_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&idx_, idx_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&idx_, idx_bytes)) } PublicParam publicParam = std::get<6>(GetParam()); @@ -145,7 +145,7 @@ class ball_query_general : public testing::TestWithParam { new_xyz_desc_ = NULL; } if (new_xyz_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(new_xyz_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(new_xyz_)); new_xyz_ = NULL; } if (xyz_desc_) { @@ -153,7 +153,7 @@ class ball_query_general : public testing::TestWithParam { xyz_desc_ = NULL; } if (xyz_) 
{ - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(xyz_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(xyz_)); xyz_ = NULL; } if (idx_desc_) { @@ -161,7 +161,7 @@ class ball_query_general : public testing::TestWithParam { idx_desc_ = NULL; } if (idx_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(idx_)); idx_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward.cpp index 7d1a81a1c..2ef40b1bd 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward.cpp @@ -49,7 +49,7 @@ class border_align_backward : public testing::Test { grad_output_dims.size(), grad_output_dims.data())); } if (grad_output) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, 16 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -62,7 +62,7 @@ class border_align_backward : public testing::Test { } if (boxes) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&boxes_, 16 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } if (argmax_idx_desc) { @@ -74,7 +74,7 @@ class border_align_backward : public testing::Test { } if (argmax_idx) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&argmax_idx_, 16 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } if (grad_input_desc) { @@ -86,7 +86,7 @@ class border_align_backward : public testing::Test { } if (grad_input) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, 16 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -128,7 +128,7 @@ class border_align_backward : public testing::Test { } if (grad_output_) { VLOG(4) << "Destroy grad_output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = nullptr; } if 
(boxes_desc_) { @@ -138,7 +138,7 @@ class border_align_backward : public testing::Test { } if (boxes_) { VLOG(4) << "Destroy boxes_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = nullptr; } if (argmax_idx_desc_) { @@ -148,7 +148,7 @@ class border_align_backward : public testing::Test { } if (argmax_idx_) { VLOG(4) << "Destroy argmax_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(argmax_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(argmax_idx_)); argmax_idx_ = nullptr; } if (grad_input_desc_) { @@ -158,7 +158,7 @@ class border_align_backward : public testing::Test { } if (grad_input_) { VLOG(4) << "Destroy grad_input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward_general.cpp index cf99e9ed7..eedde939f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_backward/border_align_backward_general.cpp @@ -60,10 +60,10 @@ class border_align_backward_general grad_output_params.get_dim_stride().data())); } if (mluOpGetTensorElementNum(grad_output_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_output_, 2 * 16)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, 2 * 16)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_output_, mluOpDataTypeBytes(grad_output_params.get_dtype()) * mluOpGetTensorElementNum(grad_output_desc_))); @@ -82,10 +82,10 @@ class border_align_backward_general boxes_params.get_dim_stride().data())); } if (mluOpGetTensorElementNum(boxes_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtMalloc(&boxes_, 2 * 16)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, 2 * 16)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&boxes_, mluOpDataTypeBytes(boxes_params.get_dtype()) * mluOpGetTensorElementNum(boxes_desc_))); } @@ -105,10 +105,10 @@ class border_align_backward_general argmax_idx_params.get_dim_stride().data())); } if (mluOpGetTensorElementNum(argmax_idx_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&argmax_idx_, (2 * 16))); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&argmax_idx_, (2 * 16))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&argmax_idx_, mluOpDataTypeBytes(argmax_idx_params.get_dtype()) * mluOpGetTensorElementNum(argmax_idx_desc_))); @@ -130,10 +130,10 @@ class border_align_backward_general } if (mluOpGetTensorElementNum(grad_input_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_input_, 2 * 16)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, 2 * 16)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * mluOpGetTensorElementNum(grad_input_desc_))); @@ -176,7 +176,7 @@ class border_align_backward_general } if (grad_output_) { VLOG(4) << "Destroy grad_output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = nullptr; } if (boxes_desc_) { @@ -186,7 +186,7 @@ class border_align_backward_general } if (boxes_) { VLOG(4) << "Destroy boxes_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = nullptr; } if (argmax_idx_desc_) { @@ -196,7 +196,7 @@ class border_align_backward_general } if (argmax_idx_) { VLOG(4) << "Destroy argmax_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(argmax_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(argmax_idx_)); argmax_idx_ = nullptr; } if (grad_input_desc_) { @@ 
-206,7 +206,7 @@ class border_align_backward_general } if (grad_input_) { VLOG(4) << "Destroy grad_input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward.cpp index e9e63a2a9..78233dfab 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward.cpp @@ -49,7 +49,7 @@ class border_align_forward_test : public testing::Test { } if (input) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, 4000 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } if (boxes_desc) { @@ -61,7 +61,7 @@ class border_align_forward_test : public testing::Test { } if (boxes) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&boxes_, 800 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } if (output_desc) { @@ -73,7 +73,7 @@ class border_align_forward_test : public testing::Test { } if (output) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_, 4000 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } if (argmax_idx_desc) { @@ -84,7 +84,7 @@ class border_align_forward_test : public testing::Test { argmax_idx_dims.data())); } if (argmax_idx) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&argmax_idx_, 4000 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -123,7 +123,7 @@ class border_align_forward_test : public testing::Test { } if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); } if (boxes_desc_) { VLOG(4) << "Destroy boxes_desc_"; @@ -131,7 +131,7 @@ class border_align_forward_test : public testing::Test { } if 
(boxes_) { VLOG(4) << "Destroy boxes_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); } if (output_desc_) { VLOG(4) << "Destroy output_desc_"; @@ -139,7 +139,7 @@ class border_align_forward_test : public testing::Test { } if (output_) { VLOG(4) << "Destroy output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); } if (argmax_idx_desc_) { VLOG(4) << "Destroy argmax_idx_desc_"; @@ -147,7 +147,7 @@ class border_align_forward_test : public testing::Test { } if (argmax_idx_) { VLOG(4) << "Destroy argmax_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(argmax_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(argmax_idx_)); } } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward_general.cpp index 25c852c67..8880d4c94 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/border_align_forward/border_align_forward_general.cpp @@ -63,7 +63,7 @@ class border_align_forward_general int input_elenum = mluOpGetTensorElementNum(input_desc_); if (input_elenum > 0) { VLOG(4) << "malloc input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, input_elenum * mluOpDataTypeBytes(input_dtype))); } @@ -86,7 +86,7 @@ class border_align_forward_general } int boxes_elenum = mluOpGetTensorElementNum(boxes_desc_); if (boxes_elenum > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, boxes_elenum * mluOpDataTypeBytes(boxes_dtype))); } @@ -109,7 +109,7 @@ class border_align_forward_general } int output_elenum = mluOpGetTensorElementNum(output_desc_); if (output_elenum > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, output_elenum * mluOpDataTypeBytes(output_dtype))); } @@ -134,7 +134,7 @@ class border_align_forward_general int argmax_idx_elenum = mluOpGetTensorElementNum(argmax_idx_desc_); if (argmax_idx_elenum > 0) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&argmax_idx_, argmax_idx_elenum * mluOpDataTypeBytes(argmax_idx_dtype))); } @@ -174,7 +174,7 @@ class border_align_forward_general } if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); } if (boxes_desc_) { VLOG(4) << "Destroy boxes_desc_"; @@ -182,7 +182,7 @@ class border_align_forward_general } if (boxes_) { VLOG(4) << "Destroy boxes_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); } if (output_desc_) { VLOG(4) << "Destroy output_desc_"; @@ -190,7 +190,7 @@ class border_align_forward_general } if (output_) { VLOG(4) << "Destroy output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); } if (argmax_idx_desc_) { VLOG(4) << "Destroy argmax_idx_desc_"; @@ -198,7 +198,7 @@ class border_align_forward_general } if (argmax_idx_) { VLOG(4) << "Destroy argmax_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(argmax_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(argmax_idx_)); } } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward.cpp index 7840da32c..66189f1f9 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward.cpp @@ -48,7 +48,7 @@ class deform_roi_pooling_backward : public 
testing::Test { std::vector({3, 3, 3, 1}).data())); } if (grad_output) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_output_, 10)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, 10)); } if (input_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&input_desc_)); @@ -57,7 +57,7 @@ class deform_roi_pooling_backward : public testing::Test { std::vector({1, 5, 5, 1}).data())); } if (input) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, 10)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, 10)); } if (rois_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&rois_desc_)); @@ -66,7 +66,7 @@ class deform_roi_pooling_backward : public testing::Test { std::vector({3, 5}).data())); } if (rois) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, 5)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, 5)); } if (offset_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&offset_desc_)); @@ -75,7 +75,7 @@ class deform_roi_pooling_backward : public testing::Test { std::vector({3, 2, 3, 3}).data())); } if (offset) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&offset_, 5)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&offset_, 5)); } if (grad_input_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&grad_input_desc_)); @@ -84,7 +84,7 @@ class deform_roi_pooling_backward : public testing::Test { std::vector({1, 5, 5, 1}).data())); } if (grad_input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_input_, 10)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, 10)); } if (grad_offset_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&grad_offset_desc_)); @@ -93,7 +93,7 @@ class deform_roi_pooling_backward : public testing::Test { std::vector({3, 2, 3, 3}).data())); } if (grad_offset) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_offset_, 10)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_offset_, 10)); } } mluOpStatus_t compute() { @@ -135,7 +135,7 @@ class deform_roi_pooling_backward : public testing::Test { } if (grad_output_) { VLOG(4) << "Destroy 
grad_output"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = nullptr; } if (input_desc_) { @@ -144,7 +144,7 @@ class deform_roi_pooling_backward : public testing::Test { } if (input_) { VLOG(4) << "Destroy input"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (rois_desc_) { @@ -153,7 +153,7 @@ class deform_roi_pooling_backward : public testing::Test { } if (rois_) { VLOG(4) << "Destroy rois"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = nullptr; } if (offset_desc_) { @@ -162,7 +162,7 @@ class deform_roi_pooling_backward : public testing::Test { } if (offset_) { VLOG(4) << "Destroy offset"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(offset_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(offset_)); offset_ = nullptr; } if (grad_input_desc_) { @@ -171,7 +171,7 @@ class deform_roi_pooling_backward : public testing::Test { } if (grad_input_) { VLOG(4) << "Destroy grad_input"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } if (grad_offset_desc_) { @@ -180,7 +180,7 @@ class deform_roi_pooling_backward : public testing::Test { } if (grad_offset_) { VLOG(4) << "Destroy grad_offset"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_offset_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_offset_)); grad_offset_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward_general.cpp index 718ad39de..5f9c1359a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward_general.cpp +++ 
b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_backward/deform_roi_pooling_backward_general.cpp @@ -64,7 +64,7 @@ class deform_roi_pooling_backward_general uint64_t go_ele_num = mluOpGetTensorElementNum(grad_output_desc_); uint64_t go_bytes = mluOpDataTypeBytes(grad_output_dtype) * go_ele_num; if (go_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_output_, go_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, go_bytes)); } MLUOpTensorParam inputDescParam = std::get<1>(GetParam()); @@ -79,7 +79,7 @@ class deform_roi_pooling_backward_general uint64_t i_ele_num = mluOpGetTensorElementNum(input_desc_); uint64_t i_bytes = mluOpDataTypeBytes(input_dtype) * i_ele_num; if (i_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)); } MLUOpTensorParam roisDescParam = std::get<2>(GetParam()); @@ -93,7 +93,7 @@ class deform_roi_pooling_backward_general uint64_t roi_ele_num = mluOpGetTensorElementNum(rois_desc_); uint64_t roi_bytes = mluOpDataTypeBytes(rois_dtype) * roi_ele_num; if (roi_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, roi_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, roi_bytes)); } MLUOpTensorParam offsetDescParam = std::get<3>(GetParam()); @@ -108,7 +108,7 @@ class deform_roi_pooling_backward_general uint64_t offset_ele_num = mluOpGetTensorElementNum(offset_desc_); uint64_t offset_bytes = mluOpDataTypeBytes(offset_dtype) * offset_ele_num; if (offset_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&offset_, offset_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&offset_, offset_bytes)); } MLUOpTensorParam gradInputDescParam = std::get<4>(GetParam()); @@ -123,7 +123,7 @@ class deform_roi_pooling_backward_general uint64_t gi_ele_num = mluOpGetTensorElementNum(grad_input_desc_); uint64_t gi_bytes = mluOpDataTypeBytes(grad_input_dtype) * gi_ele_num; if (gi_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtMalloc(&grad_input_, gi_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, gi_bytes)); } MLUOpTensorParam gradOffsetDescParam = std::get<5>(GetParam()); @@ -138,7 +138,7 @@ class deform_roi_pooling_backward_general uint64_t gf_ele_num = mluOpGetTensorElementNum(grad_offset_desc_); uint64_t gf_bytes = mluOpDataTypeBytes(grad_offset_dtype) * gf_ele_num; if (gf_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_offset_, gf_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_offset_, gf_bytes)); } DeformRoiPoolingBackwardAdditionalParam additoinal_param_ = @@ -180,7 +180,7 @@ class deform_roi_pooling_backward_general } if (grad_output_) { VLOG(4) << "Destroy grad_output"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = nullptr; } if (input_desc_) { @@ -189,7 +189,7 @@ class deform_roi_pooling_backward_general } if (input_) { VLOG(4) << "Destroy input"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (rois_desc_) { @@ -198,7 +198,7 @@ class deform_roi_pooling_backward_general } if (rois_) { VLOG(4) << "Destroy rois"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = nullptr; } if (offset_desc_) { @@ -207,7 +207,7 @@ class deform_roi_pooling_backward_general } if (offset_) { VLOG(4) << "Destroy offset"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(offset_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(offset_)); offset_ = nullptr; } if (grad_input_desc_) { @@ -216,7 +216,7 @@ class deform_roi_pooling_backward_general } if (grad_input_) { VLOG(4) << "Destroy grad_input"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } if (grad_offset_desc_) { @@ -225,7 +225,7 @@ class deform_roi_pooling_backward_general } if (grad_offset_) { VLOG(4) << "Destroy 
grad_offset"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_offset_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_offset_)); grad_offset_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward.cpp index 8923cd450..21bfc0442 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward.cpp @@ -47,7 +47,7 @@ class deform_roi_pooling_forward : public testing::Test { std::vector({1, 5, 5, 1}).data())); } if (input) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, 10)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, 10)); } if (rois_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&rois_desc_)); @@ -56,7 +56,7 @@ class deform_roi_pooling_forward : public testing::Test { std::vector({3, 5}).data())); } if (rois) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, 5)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, 5)); } if (offset_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&offset_desc_)); @@ -65,7 +65,7 @@ class deform_roi_pooling_forward : public testing::Test { std::vector({3, 2, 3, 3}).data())); } if (offset) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&offset_, 5)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&offset_, 5)); } if (output_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -74,7 +74,7 @@ class deform_roi_pooling_forward : public testing::Test { std::vector({3, 3, 3, 1}).data())); } if (output) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, 10)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, 10)); } } mluOpStatus_t compute() { @@ -111,7 +111,7 @@ class deform_roi_pooling_forward : public testing::Test { } if (input_) { VLOG(4) << "Destroy input"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (rois_desc_) { @@ -120,7 +120,7 @@ class deform_roi_pooling_forward : public testing::Test { } if (rois_) { VLOG(4) << "Destroy rois"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = nullptr; } if (offset_desc_) { @@ -129,7 +129,7 @@ class deform_roi_pooling_forward : public testing::Test { } if (offset_) { VLOG(4) << "Destroy offset"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(offset_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(offset_)); offset_ = nullptr; } if (output_desc_) { @@ -138,7 +138,7 @@ class deform_roi_pooling_forward : public testing::Test { } if (output_) { VLOG(4) << "Destroy output"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward_general.cpp index 42312fe99..920c2aa8a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/deform_roi_pooling_forward/deform_roi_pooling_forward_general.cpp @@ -62,7 +62,7 @@ class deform_roi_pooling_forward_general uint64_t i_ele_num = mluOpGetTensorElementNum(input_desc_); uint64_t i_bytes = mluOpDataTypeBytes(input_dtype) * i_ele_num; if (i_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)); } MLUOpTensorParam roisDescParam = std::get<1>(GetParam()); @@ -76,7 +76,7 @@ class deform_roi_pooling_forward_general uint64_t roi_ele_num = mluOpGetTensorElementNum(rois_desc_); uint64_t roi_bytes = mluOpDataTypeBytes(rois_dtype) * roi_ele_num; if (roi_bytes > 0) { - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, roi_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, roi_bytes)); } MLUOpTensorParam offsetDescParam = std::get<2>(GetParam()); @@ -91,7 +91,7 @@ class deform_roi_pooling_forward_general uint64_t offset_ele_num = mluOpGetTensorElementNum(offset_desc_); uint64_t offset_bytes = mluOpDataTypeBytes(offset_dtype) * offset_ele_num; if (offset_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&offset_, offset_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&offset_, offset_bytes)); } DeformRoiPoolingForwardAdditionalParam additoinal_param_ = @@ -111,7 +111,7 @@ class deform_roi_pooling_forward_general uint64_t o_ele_num = mluOpGetTensorElementNum(output_desc_); uint64_t o_bytes = mluOpDataTypeBytes(output_dtype) * o_ele_num; if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -147,7 +147,7 @@ class deform_roi_pooling_forward_general } if (input_) { VLOG(4) << "Destroy input"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (rois_desc_) { @@ -156,7 +156,7 @@ class deform_roi_pooling_forward_general } if (rois_) { VLOG(4) << "Destroy rois"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = nullptr; } if (offset_desc_) { @@ -165,7 +165,7 @@ class deform_roi_pooling_forward_general } if (offset_) { VLOG(4) << "Destroy offset"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(offset_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(offset_)); offset_ = nullptr; } if (output_desc_) { @@ -174,7 +174,7 @@ class deform_roi_pooling_forward_general } if (output_) { VLOG(4) << "Destroy output"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } } 
catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp index 305b00c99..824f679ef 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward.cpp @@ -57,12 +57,12 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (vertices) { if (vertices_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&vertices_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum( vertices_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&vertices_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -77,11 +77,11 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (mask) { if (mask_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_, MLUOP_DTYPE_BOOL * mluOpGetTensorElementNum(mask_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_, MLUOP_DTYPE_BOOL * 2)); } } @@ -96,12 +96,12 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (num_valid) { if (num_valid_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&num_valid_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum( num_valid_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&num_valid_, MLUOP_DTYPE_INT32 * 2)); } } @@ -116,11 +116,11 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (idx) { if (idx_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&idx_, MLUOP_DTYPE_INT32 * 
mluOpGetTensorElementNum(idx_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&idx_, MLUOP_DTYPE_INT32 * 2)); } } @@ -151,7 +151,7 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (vertices_) { VLOG(4) << "Destroy vertices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(vertices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(vertices_)); vertices_ = nullptr; } @@ -163,7 +163,7 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (mask_) { VLOG(4) << "Destroy mask_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_)); mask_ = nullptr; } @@ -175,7 +175,7 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (num_valid_) { VLOG(4) << "Destroy num_valid_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(num_valid_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(num_valid_)); num_valid_ = nullptr; } @@ -187,7 +187,7 @@ class diff_iou_rotated_sort_vertices_forward : public testing::Test { if (idx_) { VLOG(4) << "Destroy idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(idx_)); idx_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_general.cpp index f622b5ec0..7a35d99b2 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_general.cpp @@ -52,12 +52,12 @@ class diff_iou_rotated_sort_vertices_forward_general vertices_params.get_dim_size().data())); if (mluOpGetTensorElementNum(vertices_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess 
== cnrtMalloc(&vertices_, mluOpDataTypeBytes(vertices_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&vertices_, mluOpDataTypeBytes(vertices_params.get_dtype()) * mluOpGetTensorElementNum(vertices_desc_))); @@ -70,12 +70,12 @@ class diff_iou_rotated_sort_vertices_forward_general mask_params.get_dim_nb(), mask_params.get_dim_size().data())); if (mluOpGetTensorElementNum(mask_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_, mluOpDataTypeBytes(mask_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_, mluOpDataTypeBytes(mask_params.get_dtype()) * mluOpGetTensorElementNum(mask_desc_))); @@ -89,13 +89,13 @@ class diff_iou_rotated_sort_vertices_forward_general num_valid_params.get_dim_size().data())); if (mluOpGetTensorElementNum(num_valid_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &num_valid_, mluOpDataTypeBytes(num_valid_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&num_valid_, mluOpDataTypeBytes(num_valid_params.get_dtype()) * mluOpGetTensorElementNum(num_valid_desc_))); @@ -108,11 +108,11 @@ class diff_iou_rotated_sort_vertices_forward_general idx_params.get_dim_nb(), idx_params.get_dim_size().data())); if (mluOpGetTensorElementNum(idx_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&idx_, mluOpDataTypeBytes(idx_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&idx_, mluOpDataTypeBytes(idx_params.get_dtype()) * mluOpGetTensorElementNum(idx_desc_))); } @@ -154,7 +154,7 @@ class diff_iou_rotated_sort_vertices_forward_general if (vertices_) { VLOG(4) << "Destroy vertices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(vertices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(vertices_)); vertices_ = nullptr; } @@ -166,7 +166,7 
@@ class diff_iou_rotated_sort_vertices_forward_general if (mask_) { VLOG(4) << "Destroy mask_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_)); mask_ = nullptr; } @@ -178,7 +178,7 @@ class diff_iou_rotated_sort_vertices_forward_general if (num_valid_) { VLOG(4) << "Destroy num_valid_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(num_valid_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(num_valid_)); num_valid_ = nullptr; } @@ -190,7 +190,7 @@ class diff_iou_rotated_sort_vertices_forward_general if (idx_) { VLOG(4) << "Destroy idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(idx_)); idx_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp index d4422363c..0591586c4 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp @@ -55,12 +55,12 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (grad_voxel_feats) { if (grad_voxel_feats_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_voxel_feats_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(grad_voxel_feats_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_voxel_feats_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -76,11 +76,11 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (feats) { if (feats_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feats_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(feats_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&feats_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -95,12 +95,12 @@ 
class dynamic_point_to_voxel_backward : public testing::Test { if (voxel_feats) { if (voxel_feats_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_feats_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum( voxel_feats_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_feats_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -115,12 +115,12 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (point2voxel_map) { if (point2voxel_map_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&point2voxel_map_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum( point2voxel_map_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&point2voxel_map_, MLUOP_DTYPE_INT32 * 2)); } } @@ -136,12 +136,12 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (voxel_points_count) { if (voxel_points_count_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_points_count_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(voxel_points_count_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_points_count_, MLUOP_DTYPE_INT32 * 2)); } } @@ -156,18 +156,18 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (voxel_num) { if (voxel_num_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_num_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(voxel_num_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_num_, MLUOP_DTYPE_INT32 * 1)); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_INT32 * workspace_size_)); } @@ -181,12 +181,12 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (grad_feats) { if (grad_feats_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == 
cnrtMalloc(&grad_feats_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(grad_feats_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_feats_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -220,7 +220,7 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (grad_voxel_feats_) { VLOG(4) << "Destroy grad_voxel_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_voxel_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_voxel_feats_)); grad_voxel_feats_ = nullptr; } @@ -232,7 +232,7 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (feats_) { VLOG(4) << "Destroy feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(feats_)); feats_ = nullptr; } @@ -244,7 +244,7 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (voxel_feats_) { VLOG(4) << "Destroy voxel_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_feats_)); voxel_feats_ = nullptr; } @@ -256,7 +256,7 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (point2voxel_map_) { VLOG(4) << "Destroy point2voxel_map_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(point2voxel_map_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(point2voxel_map_)); point2voxel_map_ = nullptr; } @@ -268,7 +268,7 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (voxel_points_count_) { VLOG(4) << "Destroy voxel_points_count_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_points_count_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_points_count_)); voxel_points_count_ = nullptr; } @@ -280,13 +280,13 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (voxel_num_) { VLOG(4) << "Destroy voxel_num_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_num_)); voxel_num_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } @@ -298,7 +298,7 @@ class dynamic_point_to_voxel_backward : public testing::Test { if (grad_feats_) { VLOG(4) << "Destroy grad_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_feats_)); grad_feats_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_general.cpp index 1cf57d0f6..503344e8f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_general.cpp @@ -57,13 +57,13 @@ class dynamic_point_to_voxel_backward_general if (mluOpGetTensorElementNum(grad_voxel_feats_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &grad_voxel_feats_, mluOpDataTypeBytes(grad_voxel_feats_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_voxel_feats_, mluOpDataTypeBytes(grad_voxel_feats_params.get_dtype()) * mluOpGetTensorElementNum(grad_voxel_feats_desc_))); @@ -75,12 +75,12 @@ class dynamic_point_to_voxel_backward_general feats_params.get_dim_nb(), feats_params.get_dim_size().data())); if (mluOpGetTensorElementNum(feats_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feats_, mluOpDataTypeBytes(feats_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feats_, mluOpDataTypeBytes(feats_params.get_dtype()) * mluOpGetTensorElementNum(feats_desc_))); } @@ -93,12 +93,12 @@ class dynamic_point_to_voxel_backward_general 
voxel_feats_params.get_dim_size().data())); if (mluOpGetTensorElementNum(voxel_feats_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_feats_, mluOpDataTypeBytes(voxel_feats_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_feats_, mluOpDataTypeBytes(voxel_feats_params.get_dtype()) * mluOpGetTensorElementNum(voxel_feats_desc_))); @@ -113,13 +113,13 @@ class dynamic_point_to_voxel_backward_general point2voxel_map_params.get_dim_size().data())); if (mluOpGetTensorElementNum(point2voxel_map_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &point2voxel_map_, mluOpDataTypeBytes(point2voxel_map_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&point2voxel_map_, mluOpDataTypeBytes(point2voxel_map_params.get_dtype()) * mluOpGetTensorElementNum(point2voxel_map_desc_))); @@ -135,13 +135,13 @@ class dynamic_point_to_voxel_backward_general if (mluOpGetTensorElementNum(voxel_points_count_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &voxel_points_count_, mluOpDataTypeBytes(voxel_points_count_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &voxel_points_count_, mluOpDataTypeBytes(voxel_points_count_params.get_dtype()) * @@ -156,12 +156,12 @@ class dynamic_point_to_voxel_backward_general voxel_num_params.get_dim_size().data())); if (mluOpGetTensorElementNum(voxel_num_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_num_, mluOpDataTypeBytes(voxel_num_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_num_, mluOpDataTypeBytes(voxel_num_params.get_dtype()) * mluOpGetTensorElementNum(voxel_num_desc_))); @@ -175,12 +175,12 @@ class dynamic_point_to_voxel_backward_general 
grad_feats_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_feats_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_feats_, mluOpDataTypeBytes(grad_feats_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_feats_, mluOpDataTypeBytes(grad_feats_params.get_dtype()) * mluOpGetTensorElementNum(grad_feats_desc_))); @@ -188,7 +188,7 @@ class dynamic_point_to_voxel_backward_general target_device_ = std::get<8>(GetParam()); expected_status_ = std::get<9>(GetParam()); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -228,7 +228,7 @@ class dynamic_point_to_voxel_backward_general if (grad_voxel_feats_) { VLOG(4) << "Destroy grad_voxel_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_voxel_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_voxel_feats_)); grad_voxel_feats_ = nullptr; } @@ -240,7 +240,7 @@ class dynamic_point_to_voxel_backward_general if (feats_) { VLOG(4) << "Destroy feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(feats_)); feats_ = nullptr; } @@ -252,7 +252,7 @@ class dynamic_point_to_voxel_backward_general if (voxel_feats_) { VLOG(4) << "Destroy voxel_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_feats_)); voxel_feats_ = nullptr; } @@ -264,7 +264,7 @@ class dynamic_point_to_voxel_backward_general if (point2voxel_map_) { VLOG(4) << "Destroy point2voxel_map_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(point2voxel_map_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(point2voxel_map_)); point2voxel_map_ = nullptr; } @@ -276,7 +276,7 @@ class dynamic_point_to_voxel_backward_general if (voxel_points_count_) { VLOG(4) << "Destroy voxel_points_count_"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_points_count_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_points_count_)); voxel_points_count_ = nullptr; } @@ -288,13 +288,13 @@ class dynamic_point_to_voxel_backward_general if (voxel_num_) { VLOG(4) << "Destroy voxel_num_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_num_)); voxel_num_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } @@ -306,7 +306,7 @@ class dynamic_point_to_voxel_backward_general if (grad_feats_) { VLOG(4) << "Destroy grad_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_feats_)); grad_feats_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp index e35581084..4a5b4e7fe 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp @@ -55,11 +55,11 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (feats) { if (feats_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feats_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(feats_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&feats_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -75,11 +75,11 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (coors) { if (coors_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&coors_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(coors_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&coors_, MLUOP_DTYPE_INT32 * 2)); } } @@ -94,12 +94,12 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_feats) { if (voxel_feats_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_feats_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum( voxel_feats_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_feats_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -114,12 +114,12 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_coors) { if (voxel_coors_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_coors_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum( voxel_coors_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_coors_, MLUOP_DTYPE_INT32 * 2)); } } @@ -134,12 +134,12 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (point2voxel_map) { if (point2voxel_map_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&point2voxel_map_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum( point2voxel_map_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&point2voxel_map_, MLUOP_DTYPE_INT32 * 2)); } } @@ -155,12 +155,12 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_points_count) { if (voxel_points_count_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_points_count_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(voxel_points_count_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_points_count_, MLUOP_DTYPE_INT32 * 2)); } } @@ -175,18 +175,18 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_num) { if (voxel_num_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_num_, MLUOP_DTYPE_INT32 * 
mluOpGetTensorElementNum(voxel_num_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_num_, MLUOP_DTYPE_INT32 * 1)); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_INT32 * workspace_size_)); } } @@ -219,7 +219,7 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (feats_) { VLOG(4) << "Destroy feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(feats_)); feats_ = nullptr; } @@ -231,7 +231,7 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (coors_) { VLOG(4) << "Destroy coors_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(coors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(coors_)); coors_ = nullptr; } @@ -243,7 +243,7 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_feats_) { VLOG(4) << "Destroy voxel_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_feats_)); voxel_feats_ = nullptr; } @@ -255,7 +255,7 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_coors_) { VLOG(4) << "Destroy voxel_coors_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_coors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_coors_)); voxel_coors_ = nullptr; } @@ -267,7 +267,7 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (point2voxel_map_) { VLOG(4) << "Destroy point2voxel_map_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(point2voxel_map_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(point2voxel_map_)); point2voxel_map_ = nullptr; } @@ -279,7 +279,7 @@ class dynamic_point_to_voxel_forward : public testing::Test { if (voxel_points_count_) { VLOG(4) << "Destroy voxel_points_count_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_points_count_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_points_count_)); voxel_points_count_ = nullptr; } @@ -291,13 +291,13 @@ class 
dynamic_point_to_voxel_forward : public testing::Test { if (voxel_num_) { VLOG(4) << "Destroy voxel_num_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_num_)); voxel_num_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_general.cpp index 95e2a4a0f..1b6a4c933 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_general.cpp @@ -53,12 +53,12 @@ class dynamic_point_to_voxel_forward_general feats_params.get_dim_nb(), feats_params.get_dim_size().data())); if (mluOpGetTensorElementNum(feats_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feats_, mluOpDataTypeBytes(feats_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feats_, mluOpDataTypeBytes(feats_params.get_dtype()) * mluOpGetTensorElementNum(feats_desc_))); } @@ -70,12 +70,12 @@ class dynamic_point_to_voxel_forward_general coors_params.get_dim_nb(), coors_params.get_dim_size().data())); if (mluOpGetTensorElementNum(coors_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&coors_, mluOpDataTypeBytes(coors_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&coors_, mluOpDataTypeBytes(coors_params.get_dtype()) * mluOpGetTensorElementNum(coors_desc_))); } @@ -88,12 +88,12 @@ class dynamic_point_to_voxel_forward_general 
voxel_feats_params.get_dim_size().data())); if (mluOpGetTensorElementNum(voxel_feats_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_feats_, mluOpDataTypeBytes(voxel_feats_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_feats_, mluOpDataTypeBytes(voxel_feats_params.get_dtype()) * mluOpGetTensorElementNum(voxel_feats_desc_))); @@ -107,12 +107,12 @@ class dynamic_point_to_voxel_forward_general voxel_coors_params.get_dim_size().data())); if (mluOpGetTensorElementNum(voxel_coors_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_coors_, mluOpDataTypeBytes(voxel_coors_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_coors_, mluOpDataTypeBytes(voxel_coors_params.get_dtype()) * mluOpGetTensorElementNum(voxel_coors_desc_))); @@ -127,13 +127,13 @@ class dynamic_point_to_voxel_forward_general point2voxel_map_params.get_dim_size().data())); if (mluOpGetTensorElementNum(point2voxel_map_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &point2voxel_map_, mluOpDataTypeBytes(point2voxel_map_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&point2voxel_map_, mluOpDataTypeBytes(point2voxel_map_params.get_dtype()) * mluOpGetTensorElementNum(point2voxel_map_desc_))); @@ -149,13 +149,13 @@ class dynamic_point_to_voxel_forward_general if (mluOpGetTensorElementNum(voxel_points_count_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &voxel_points_count_, mluOpDataTypeBytes(voxel_points_count_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &voxel_points_count_, mluOpDataTypeBytes(voxel_points_count_params.get_dtype()) * @@ -170,12 +170,12 @@ class dynamic_point_to_voxel_forward_general 
voxel_num_params.get_dim_size().data())); if (mluOpGetTensorElementNum(voxel_num_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_num_, mluOpDataTypeBytes(voxel_num_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&voxel_num_, mluOpDataTypeBytes(voxel_num_params.get_dtype()) * mluOpGetTensorElementNum(voxel_num_desc_))); @@ -185,7 +185,7 @@ class dynamic_point_to_voxel_forward_general target_device_ = std::get<8>(GetParam()); expected_status_ = std::get<9>(GetParam()); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -225,7 +225,7 @@ class dynamic_point_to_voxel_forward_general if (feats_) { VLOG(4) << "Destroy feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(feats_)); feats_ = nullptr; } @@ -237,7 +237,7 @@ class dynamic_point_to_voxel_forward_general if (coors_) { VLOG(4) << "Destroy coors_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(coors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(coors_)); coors_ = nullptr; } @@ -249,7 +249,7 @@ class dynamic_point_to_voxel_forward_general if (voxel_feats_) { VLOG(4) << "Destroy voxel_feats_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_feats_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_feats_)); voxel_feats_ = nullptr; } @@ -261,7 +261,7 @@ class dynamic_point_to_voxel_forward_general if (voxel_coors_) { VLOG(4) << "Destroy voxel_coors_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_coors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_coors_)); voxel_coors_ = nullptr; } @@ -273,7 +273,7 @@ class dynamic_point_to_voxel_forward_general if (point2voxel_map_) { VLOG(4) << "Destroy point2voxel_map_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(point2voxel_map_)); + GTEST_CHECK(cnrtSuccess == 
cnrtFree(point2voxel_map_)); point2voxel_map_ = nullptr; } @@ -285,7 +285,7 @@ class dynamic_point_to_voxel_forward_general if (voxel_points_count_) { VLOG(4) << "Destroy voxel_points_count_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_points_count_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_points_count_)); voxel_points_count_ = nullptr; } @@ -297,13 +297,13 @@ class dynamic_point_to_voxel_forward_general if (voxel_num_) { VLOG(4) << "Destroy voxel_num_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_num_)); voxel_num_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_ExecFFT.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_ExecFFT.cpp index c2fd648a0..90776db07 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_ExecFFT.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_ExecFFT.cpp @@ -47,16 +47,16 @@ class fft_ExecFFT : public testing::Test { if (input) { size_t i_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)); } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } if (output) { size_t o_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_COMPLEX_FLOAT); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } } @@ -136,12 +136,12 @@ class fft_ExecFFT : public testing::Test { } if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } if (output_) { VLOG(4) << "Destroy output_"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } if (fft_plan_) { @@ -151,7 +151,7 @@ class fft_ExecFFT : public testing::Test { } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_SetFFTReserveArea.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_SetFFTReserveArea.cpp index a1610d0a1..3c36a4278 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_SetFFTReserveArea.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_SetFFTReserveArea.cpp @@ -44,7 +44,7 @@ class fft_SetFFTReserveArea : public testing::Test { } if (reservespace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&reservespace_, reservespace_size)); } } @@ -79,7 +79,7 @@ class fft_SetFFTReserveArea : public testing::Test { } if (reservespace_) { VLOG(4) << "Destroy reservespace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(reservespace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(reservespace_)); reservespace_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_general.cpp index c47f15fcb..4fa250c75 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/fft/fft_general.cpp @@ -106,12 +106,12 @@ class fft_general : public testing::TestWithParam { } if (workspace_size_) { VLOG(4) << "Destroy workspace_size_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_size_)); workspace_size_ = nullptr; } if (reservespace_size_) { VLOG(4) << "Destroy reservespace_size_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(reservespace_size_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(reservespace_size_)); reservespace_size_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp index d0252d084..e55385107 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp @@ -52,11 +52,11 @@ class focal_loss_sigmoid_backward : public testing::Test { if (input) { if (input_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(input_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -72,11 +72,11 @@ class focal_loss_sigmoid_backward : public testing::Test { if (target) { if (target_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&target_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(target_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&target_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -88,7 +88,7 @@ class focal_loss_sigmoid_backward : public testing::Test { weight_desc_dims.data())); if (weight) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&weight_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(weight_desc_))); } @@ -103,12 +103,12 @@ class focal_loss_sigmoid_backward : public testing::Test { if (grad_input) { if (grad_input_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(grad_input_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -140,7 +140,7 @@ class 
focal_loss_sigmoid_backward : public testing::Test { if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } @@ -152,7 +152,7 @@ class focal_loss_sigmoid_backward : public testing::Test { if (target_) { VLOG(4) << "Destroy target_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(target_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(target_)); target_ = nullptr; } @@ -164,7 +164,7 @@ class focal_loss_sigmoid_backward : public testing::Test { if (weight_) { VLOG(4) << "Destroy weight_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = nullptr; } @@ -176,7 +176,7 @@ class focal_loss_sigmoid_backward : public testing::Test { if (grad_input_) { VLOG(4) << "Destroy grad_input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward_general.cpp index d8e56f4ba..ab7cdafbf 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward_general.cpp @@ -55,12 +55,12 @@ class focal_loss_sigmoid_backward_general input_params.get_dim_nb(), input_params.get_dim_size().data())); if (mluOpGetTensorElementNum(input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * mluOpGetTensorElementNum(input_desc_))); } @@ -72,12 +72,12 @@ class 
focal_loss_sigmoid_backward_general target_params.get_dim_nb(), target_params.get_dim_size().data())); if (mluOpGetTensorElementNum(target_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&target_, mluOpDataTypeBytes(target_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&target_, mluOpDataTypeBytes(target_params.get_dtype()) * mluOpGetTensorElementNum(target_desc_))); } @@ -89,12 +89,12 @@ class focal_loss_sigmoid_backward_general weight_params.get_dim_nb(), weight_params.get_dim_size().data())); if (mluOpGetTensorElementNum(weight_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&weight_, mluOpDataTypeBytes(weight_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&weight_, mluOpDataTypeBytes(weight_params.get_dtype()) * mluOpGetTensorElementNum(weight_desc_))); } @@ -107,12 +107,12 @@ class focal_loss_sigmoid_backward_general grad_input_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * mluOpGetTensorElementNum(grad_input_desc_))); @@ -157,7 +157,7 @@ class focal_loss_sigmoid_backward_general if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } @@ -169,7 +169,7 @@ class focal_loss_sigmoid_backward_general if (target_) { VLOG(4) << "Destroy target_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(target_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(target_)); target_ = nullptr; } @@ -181,7 +181,7 @@ class focal_loss_sigmoid_backward_general if (weight_) { 
VLOG(4) << "Destroy weight_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = nullptr; } @@ -193,7 +193,7 @@ class focal_loss_sigmoid_backward_general if (grad_input_) { VLOG(4) << "Destroy grad_input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward.cpp index 3c675aa8c..73eaedc0f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward.cpp @@ -51,11 +51,11 @@ class focal_loss_sigmoid_forward : public testing::Test { if (input) { if (input_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(input_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -71,11 +71,11 @@ class focal_loss_sigmoid_forward : public testing::Test { if (target) { if (target_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&target_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(target_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&target_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -87,7 +87,7 @@ class focal_loss_sigmoid_forward : public testing::Test { weight_desc_dims.data())); if (weight) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&weight_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(weight_desc_))); } @@ -103,11 +103,11 @@ class focal_loss_sigmoid_forward : public testing::Test { if (output) { if (output_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess 
== cnrtMalloc(&output_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(output_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -138,7 +138,7 @@ class focal_loss_sigmoid_forward : public testing::Test { if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } @@ -150,7 +150,7 @@ class focal_loss_sigmoid_forward : public testing::Test { if (target_) { VLOG(4) << "Destroy target_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(target_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(target_)); target_ = nullptr; } @@ -162,7 +162,7 @@ class focal_loss_sigmoid_forward : public testing::Test { if (weight_) { VLOG(4) << "Destroy weight_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = nullptr; } @@ -174,7 +174,7 @@ class focal_loss_sigmoid_forward : public testing::Test { if (output_) { VLOG(4) << "Destroy output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward_general.cpp index 7302bcce7..37c5824fb 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/focal_loss_sigmoid_forward/focal_loss_sigmoid_forward_general.cpp @@ -55,12 +55,12 @@ class focal_loss_sigmoid_forward_general input_params.get_dim_nb(), input_params.get_dim_size().data())); if (mluOpGetTensorElementNum(input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * 2)); } else { 
GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * mluOpGetTensorElementNum(input_desc_))); } @@ -72,12 +72,12 @@ class focal_loss_sigmoid_forward_general target_params.get_dim_nb(), target_params.get_dim_size().data())); if (mluOpGetTensorElementNum(target_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&target_, mluOpDataTypeBytes(target_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&target_, mluOpDataTypeBytes(target_params.get_dtype()) * mluOpGetTensorElementNum(target_desc_))); } @@ -89,12 +89,12 @@ class focal_loss_sigmoid_forward_general weight_params.get_dim_nb(), weight_params.get_dim_size().data())); if (mluOpGetTensorElementNum(weight_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&weight_, mluOpDataTypeBytes(weight_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&weight_, mluOpDataTypeBytes(weight_params.get_dtype()) * mluOpGetTensorElementNum(weight_desc_))); } @@ -106,12 +106,12 @@ class focal_loss_sigmoid_forward_general output_params.get_dim_nb(), output_params.get_dim_size().data())); if (mluOpGetTensorElementNum(output_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_, mluOpDataTypeBytes(output_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_, mluOpDataTypeBytes(output_params.get_dtype()) * mluOpGetTensorElementNum(output_desc_))); } @@ -154,7 +154,7 @@ class focal_loss_sigmoid_forward_general if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } @@ -166,7 +166,7 @@ class focal_loss_sigmoid_forward_general if (target_) { VLOG(4) << "Destroy target_"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(target_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(target_)); target_ = nullptr; } @@ -178,7 +178,7 @@ class focal_loss_sigmoid_forward_general if (weight_) { VLOG(4) << "Destroy weight_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = nullptr; } @@ -190,7 +190,7 @@ class focal_loss_sigmoid_forward_general if (output_) { VLOG(4) << "Destroy output_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2.cpp index 2080d101b..492860751 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2.cpp @@ -56,7 +56,7 @@ class generate_proposals_v2 : public testing::Test { size_t scores_ele_num = 1 * 5 * 5 * 9; size_t scores_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t scores_bytes = scores_ele_num * scores_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, scores_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, scores_bytes)); } if (bbox_deltas_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&bbox_deltas_desc_)); @@ -69,7 +69,7 @@ class generate_proposals_v2 : public testing::Test { size_t bbox_deltas_ele_num = 2 * 32 * 16 * 16; size_t bbox_deltas_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t bbox_deltas_bytes = bbox_deltas_ele_num * bbox_deltas_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&bbox_deltas_, bbox_deltas_bytes)); } if (im_shape_desc) { @@ -83,7 +83,7 @@ class generate_proposals_v2 : public testing::Test { size_t im_shape_ele_num = 2 * 2; size_t im_shape_dtype_bytes = 
mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t im_shape_bytes = im_shape_ele_num * im_shape_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&im_shape_, im_shape_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&im_shape_, im_shape_bytes)); } if (anchors_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&anchors_desc_)); @@ -96,7 +96,7 @@ class generate_proposals_v2 : public testing::Test { size_t anchors_ele_num = 8 * 16 * 16 * 4; size_t anchors_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t anchors_bytes = anchors_ele_num * anchors_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&anchors_, anchors_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&anchors_, anchors_bytes)); } if (variances_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&variances_desc_)); @@ -109,13 +109,13 @@ class generate_proposals_v2 : public testing::Test { size_t variances_ele_num = 8 * 16 * 16 * 4; size_t variances_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t variances_bytes = variances_ele_num * variances_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&variances_, variances_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&variances_, variances_bytes)); } if (workspace) { size_t workspace_ele_num = workspace_size_; size_t workspace_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t workspace_bytes = workspace_ele_num * workspace_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_bytes)); } if (rpn_rois_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&rpn_rois_desc_)); @@ -128,7 +128,7 @@ class generate_proposals_v2 : public testing::Test { size_t rpn_rois_ele_num = 5 * 4; size_t rpn_rois_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t rpn_rois_bytes = rpn_rois_ele_num * rpn_rois_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rpn_rois_, rpn_rois_bytes)); + GTEST_CHECK(cnrtSuccess == 
cnrtMalloc(&rpn_rois_, rpn_rois_bytes)); } if (rpn_roi_probs_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&rpn_roi_probs_desc_)); @@ -142,7 +142,7 @@ class generate_proposals_v2 : public testing::Test { size_t rpn_roi_probs_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t rpn_roi_probs_bytes = rpn_roi_probs_ele_num * rpn_roi_probs_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_roi_probs_, rpn_roi_probs_bytes)); } if (rpn_rois_num_desc) { @@ -157,7 +157,7 @@ class generate_proposals_v2 : public testing::Test { size_t rpn_rois_num_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t rpn_rois_num_bytes = rpn_rois_num_ele_num * rpn_rois_num_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_rois_num_, rpn_rois_num_bytes)); } if (rpn_rois_batch_size) { @@ -166,7 +166,7 @@ class generate_proposals_v2 : public testing::Test { mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t rpn_rois_batch_size_bytes = rpn_rois_batch_size_ele_num * rpn_rois_batch_size_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_rois_batch_size_, rpn_rois_batch_size_bytes)); } } @@ -195,7 +195,7 @@ class generate_proposals_v2 : public testing::Test { scores_desc_ = NULL; } if (scores_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(scores_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(scores_)); scores_ = NULL; } if (bbox_deltas_desc_) { @@ -203,7 +203,7 @@ class generate_proposals_v2 : public testing::Test { bbox_deltas_desc_ = NULL; } if (bbox_deltas_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(bbox_deltas_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(bbox_deltas_)); bbox_deltas_ = NULL; } if (im_shape_desc_) { @@ -211,7 +211,7 @@ class generate_proposals_v2 : public testing::Test { im_shape_desc_ = NULL; } if (im_shape_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(im_shape_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(im_shape_)); im_shape_ = NULL; } if 
(anchors_desc_) { @@ -219,7 +219,7 @@ class generate_proposals_v2 : public testing::Test { anchors_desc_ = NULL; } if (anchors_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(anchors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(anchors_)); anchors_ = NULL; } if (variances_desc_) { @@ -227,11 +227,11 @@ class generate_proposals_v2 : public testing::Test { variances_desc_ = NULL; } if (variances_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(variances_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(variances_)); variances_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (rpn_rois_desc_) { @@ -239,7 +239,7 @@ class generate_proposals_v2 : public testing::Test { rpn_rois_desc_ = NULL; } if (rpn_rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_rois_)); rpn_rois_ = NULL; } if (rpn_roi_probs_desc_) { @@ -247,7 +247,7 @@ class generate_proposals_v2 : public testing::Test { rpn_roi_probs_desc_ = NULL; } if (rpn_roi_probs_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_roi_probs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_roi_probs_)); rpn_roi_probs_ = NULL; } if (rpn_rois_num_desc_) { @@ -255,11 +255,11 @@ class generate_proposals_v2 : public testing::Test { rpn_rois_num_desc_ = NULL; } if (rpn_rois_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_rois_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_rois_num_)); rpn_rois_num_ = NULL; } if (rpn_rois_batch_size_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_rois_batch_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_rois_batch_size_)); rpn_rois_batch_size_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2_general.cpp index 8747918fb..4a9e9eb5c 100644 --- 
a/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/generate_proposals_v2/generate_proposals_v2_general.cpp @@ -62,7 +62,7 @@ class generate_proposals_v2_general uint64_t scores_ele_num = mluOpGetTensorElementNum(scores_desc_); uint64_t scores_bytes = mluOpDataTypeBytes(scores_dtype) * scores_ele_num; if (scores_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, scores_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, scores_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&bbox_deltas_desc_)); @@ -78,7 +78,7 @@ class generate_proposals_v2_general uint64_t bbox_deltas_bytes = mluOpDataTypeBytes(bbox_deltas_dtype) * bbox_deltas_ele_num; if (bbox_deltas_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&bbox_deltas_, bbox_deltas_bytes)) } @@ -95,7 +95,7 @@ class generate_proposals_v2_general uint64_t im_shape_bytes = mluOpDataTypeBytes(im_shape_dtype) * im_shape_ele_num; if (im_shape_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&im_shape_, im_shape_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&im_shape_, im_shape_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&anchors_desc_)); @@ -111,7 +111,7 @@ class generate_proposals_v2_general uint64_t anchors_bytes = mluOpDataTypeBytes(anchors_dtype) * anchors_ele_num; if (anchors_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&anchors_, anchors_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&anchors_, anchors_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&variances_desc_)); @@ -127,7 +127,7 @@ class generate_proposals_v2_general uint64_t variances_bytes = mluOpDataTypeBytes(variances_dtype) * variances_ele_num; if (variances_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&variances_, variances_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&variances_, variances_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&rpn_rois_desc_)); @@ 
-143,7 +143,7 @@ class generate_proposals_v2_general uint64_t rpn_rois_bytes = mluOpDataTypeBytes(rpn_rois_dtype) * rpn_rois_ele_num; if (rpn_rois_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rpn_rois_, rpn_rois_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_rois_, rpn_rois_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&rpn_roi_probs_desc_)); @@ -162,7 +162,7 @@ class generate_proposals_v2_general uint64_t rpn_roi_probs_bytes = mluOpDataTypeBytes(rpn_roi_probs_dtype) * rpn_roi_probs_ele_num; if (rpn_roi_probs_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_roi_probs_, rpn_roi_probs_bytes)) } @@ -180,11 +180,11 @@ class generate_proposals_v2_general uint64_t rpn_rois_num_bytes = mluOpDataTypeBytes(rpn_rois_num_dtype) * rpn_rois_num_ele_num; if (rpn_rois_num_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_rois_num_, rpn_rois_num_bytes)) } - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rpn_rois_batch_size_, mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); @@ -208,7 +208,7 @@ class generate_proposals_v2_general destroy(); return status == expected_status_; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpGenerateProposalsV2( handle_, pre_nms_top_n_, post_nms_top_n_, nms_thresh_, min_size_, eta_, pixel_offset_, scores_desc_, scores_, bbox_deltas_desc_, bbox_deltas_, @@ -232,7 +232,7 @@ class generate_proposals_v2_general scores_desc_ = NULL; } if (scores_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(scores_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(scores_)); scores_ = NULL; } if (bbox_deltas_desc_) { @@ -240,7 +240,7 @@ class generate_proposals_v2_general bbox_deltas_desc_ = NULL; } if (bbox_deltas_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(bbox_deltas_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(bbox_deltas_)); bbox_deltas_ 
= NULL; } if (im_shape_desc_) { @@ -248,7 +248,7 @@ class generate_proposals_v2_general im_shape_desc_ = NULL; } if (im_shape_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(im_shape_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(im_shape_)); im_shape_ = NULL; } if (anchors_desc_) { @@ -256,7 +256,7 @@ class generate_proposals_v2_general anchors_desc_ = NULL; } if (anchors_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(anchors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(anchors_)); anchors_ = NULL; } if (variances_desc_) { @@ -264,11 +264,11 @@ class generate_proposals_v2_general variances_desc_ = NULL; } if (variances_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(variances_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(variances_)); variances_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (rpn_rois_desc_) { @@ -276,7 +276,7 @@ class generate_proposals_v2_general rpn_rois_desc_ = NULL; } if (rpn_rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_rois_)); rpn_rois_ = NULL; } if (rpn_roi_probs_desc_) { @@ -284,7 +284,7 @@ class generate_proposals_v2_general rpn_roi_probs_desc_ = NULL; } if (rpn_roi_probs_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_roi_probs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_roi_probs_)); rpn_roi_probs_ = NULL; } if (rpn_rois_num_desc_) { @@ -292,11 +292,11 @@ class generate_proposals_v2_general rpn_rois_num_desc_ = NULL; } if (rpn_rois_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_rois_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_rois_num_)); rpn_rois_num_ = NULL; } if (rpn_rois_batch_size_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rpn_rois_batch_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rpn_rois_batch_size_)); rpn_rois_batch_size_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs.cpp 
b/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs.cpp index 691c59dc8..bd3ef6511 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs.cpp @@ -64,17 +64,17 @@ class get_indice_pairs : public testing::Test { if (indices) { if (indices_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpGetTensorElementNum(indices_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, 4 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } if (indice_pairs_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&indice_pairs_desc_)); @@ -85,12 +85,12 @@ class get_indice_pairs : public testing::Test { } if (indice_pairs) { if (indice_pairs_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpGetTensorElementNum(indice_pairs_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, 54 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -104,12 +104,12 @@ class get_indice_pairs : public testing::Test { } if (out_indices) { if (out_indices_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&out_indices_, mluOpGetTensorElementNum(out_indices_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&out_indices_, 180 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -123,12 +123,12 @@ class get_indice_pairs : public testing::Test { } if (indice_num) { if (indice_num_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == 
cnrtMalloc(&indice_num_, mluOpGetTensorElementNum(indice_num_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_num_, 27 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -161,11 +161,11 @@ class get_indice_pairs : public testing::Test { indices_desc_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } if (indice_pairs_desc_) { @@ -173,7 +173,7 @@ class get_indice_pairs : public testing::Test { indice_pairs_desc_ = NULL; } if (indice_pairs_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = NULL; } if (out_indices_desc_) { @@ -181,7 +181,7 @@ class get_indice_pairs : public testing::Test { out_indices_desc_ = NULL; } if (out_indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(out_indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(out_indices_)); out_indices_ = NULL; } if (indice_num_desc_) { @@ -189,7 +189,7 @@ class get_indice_pairs : public testing::Test { indice_num_desc_ = NULL; } if (indice_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_num_)); indice_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs_general.cpp index c4275f825..f51adb7c6 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/get_indice_pairs/get_indice_pairs_general.cpp @@ -85,11 +85,11 @@ class get_indice_pairs_general : public testing::TestWithParam { uint64_t indices_ele_num = 
mluOpGetTensorElementNum(indices_desc_); if (indices_ele_num > 0) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpGetTensorElementNum(indices_desc_) * mluOpDataTypeBytes(indices_dtype))) } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, 4 * mluOpDataTypeBytes(indices_dtype))); } @@ -105,12 +105,12 @@ class get_indice_pairs_general : public testing::TestWithParam { uint64_t indice_pairs_ele_num = mluOpGetTensorElementNum(indice_pairs_desc_); if (indice_pairs_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpGetTensorElementNum(indice_pairs_desc_) * mluOpDataTypeBytes(indice_pairs_dtype))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, 54 * mluOpDataTypeBytes(indice_pairs_dtype))); } @@ -126,12 +126,12 @@ class get_indice_pairs_general : public testing::TestWithParam { out_indices_shape.data())); uint64_t out_indices_ele_num = mluOpGetTensorElementNum(out_indices_desc_); if (out_indices_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&out_indices_, mluOpGetTensorElementNum(out_indices_desc_) * mluOpDataTypeBytes(out_indices_dtype))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&out_indices_, 108 * mluOpDataTypeBytes(out_indices_dtype))); } @@ -148,12 +148,12 @@ class get_indice_pairs_general : public testing::TestWithParam { uint64_t indice_num_ele_num = mluOpGetTensorElementNum(indice_num_desc_); if (indice_num_ele_num > 0) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indice_num_, mluOpGetTensorElementNum(indice_num_desc_) * mluOpDataTypeBytes(indice_num_dtype))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indice_num_, 27 * mluOpDataTypeBytes(indice_num_dtype))); } } @@ -186,7 +186,7 @@ class get_indice_pairs_general : public testing::TestWithParam { 
destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpGetIndicePairs( handle_, sparse_conv_desc_, indices_desc_, indices_, workspace_, @@ -213,11 +213,11 @@ class get_indice_pairs_general : public testing::TestWithParam { indices_desc_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } if (indice_pairs_desc_) { @@ -225,7 +225,7 @@ class get_indice_pairs_general : public testing::TestWithParam { indice_pairs_desc_ = NULL; } if (indice_pairs_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = NULL; } if (out_indices_desc_) { @@ -233,7 +233,7 @@ class get_indice_pairs_general : public testing::TestWithParam { out_indices_desc_ = NULL; } if (out_indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(out_indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(out_indices_)); out_indices_ = NULL; } if (indice_num_desc_) { @@ -241,7 +241,7 @@ class get_indice_pairs_general : public testing::TestWithParam { indice_num_desc_ = NULL; } if (indice_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_num_)); indice_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data.cpp index 56d7ea113..ef03eb4b2 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data.cpp +++ 
b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data.cpp @@ -50,12 +50,12 @@ class indice_convolution_backward_data : public testing::Test { } if (output_grad) { if (output_grad_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_grad_, mluOpGetTensorElementNum(output_grad_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_grad_, 100 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -70,11 +70,11 @@ class indice_convolution_backward_data : public testing::Test { if (filters) { if (filters_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&filters_, mluOpGetTensorElementNum(filters_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&filters_, 1890 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -88,12 +88,12 @@ class indice_convolution_backward_data : public testing::Test { } if (indice_pairs) { if (indice_pairs_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpGetTensorElementNum(indice_pairs_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, 180 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -107,18 +107,18 @@ class indice_convolution_backward_data : public testing::Test { } if (input_grad) { if (input_grad_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_grad_, mluOpGetTensorElementNum(input_grad_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_grad_, 210 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } } @@ -144,7 +144,7 @@ class indice_convolution_backward_data : public testing::Test { output_grad_desc_ = NULL; } if (output_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_grad_)); output_grad_ = NULL; } if (filters_desc_) { @@ -152,7 +152,7 @@ class indice_convolution_backward_data : public testing::Test { filters_desc_ = NULL; } if (filters_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(filters_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(filters_)); filters_ = NULL; } if (indice_pairs_desc_) { @@ -160,11 +160,11 @@ class indice_convolution_backward_data : public testing::Test { indice_pairs_desc_ = NULL; } if (indice_pairs_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } if (input_grad_desc_) { @@ -172,7 +172,7 @@ class indice_convolution_backward_data : public testing::Test { input_grad_desc_ = NULL; } if (input_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_grad_)); input_grad_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data_general.cpp index 09b17ed2b..3484694a0 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_data/indice_convolution_backward_data_general.cpp @@ -65,7 +65,7 @@ class indice_convolution_backward_data_general output_grad_shape.data())); uint64_t output_grad_ele_num 
= mluOpGetTensorElementNum(output_grad_desc_); if (output_grad_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_grad_, mluOpGetTensorElementNum(output_grad_desc_) * mluOpDataTypeBytes(output_grad_dtype))) @@ -83,7 +83,7 @@ class indice_convolution_backward_data_general uint64_t filters_ele_num = mluOpGetTensorElementNum(filters_desc_); if (filters_ele_num > 0) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&filters_, mluOpGetTensorElementNum(filters_desc_) * mluOpDataTypeBytes(filters_dtype))); } @@ -100,7 +100,7 @@ class indice_convolution_backward_data_general uint64_t indice_pairs_ele_num = mluOpGetTensorElementNum(indice_pairs_desc_); if (indice_pairs_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpGetTensorElementNum(indice_pairs_desc_) * mluOpDataTypeBytes(indice_pairs_dtype))); @@ -118,7 +118,7 @@ class indice_convolution_backward_data_general uint64_t input_grad_ele_num = mluOpGetTensorElementNum(input_grad_desc_); if (input_grad_ele_num > 0) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_grad_, mluOpGetTensorElementNum(input_grad_desc_) * mluOpDataTypeBytes(input_grad_dtype))); } @@ -141,7 +141,7 @@ class indice_convolution_backward_data_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpIndiceConvolutionBackwardData( handle_, output_grad_desc_, output_grad_, filters_desc_, filters_, indice_pairs_desc_, indice_pairs_, indice_num_.data(), inverse_, sub_m_, @@ -163,7 +163,7 @@ class indice_convolution_backward_data_general output_grad_desc_ = NULL; } if (output_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_grad_)); output_grad_ = NULL; } if (filters_desc_) { @@ -171,7 +171,7 @@ 
class indice_convolution_backward_data_general filters_desc_ = NULL; } if (filters_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(filters_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(filters_)); filters_ = NULL; } if (indice_pairs_desc_) { @@ -179,11 +179,11 @@ class indice_convolution_backward_data_general indice_pairs_desc_ = NULL; } if (indice_pairs_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (input_grad_desc_) { @@ -191,7 +191,7 @@ class indice_convolution_backward_data_general input_grad_desc_ = NULL; } if (input_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_grad_)); input_grad_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp index d218e4436..5c121e882 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp @@ -53,12 +53,12 @@ class indice_convolution_backward_filter : public testing::Test { if (features) { if (features_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_, mluOpGetTensorElementNum(features_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -73,12 +73,12 @@ class indice_convolution_backward_filter : public testing::Test { if (output_grad) { if (output_grad_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS 
== + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_grad_, mluOpGetTensorElementNum(output_grad_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_grad_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -94,12 +94,12 @@ class indice_convolution_backward_filter : public testing::Test { if (indice_pairs) { if (indice_pairs_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpGetTensorElementNum(indice_pairs_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -115,12 +115,12 @@ class indice_convolution_backward_filter : public testing::Test { if (filters_grad) { if (filters_grad_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&filters_grad_, mluOpGetTensorElementNum(filters_grad_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&filters_grad_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -134,7 +134,7 @@ class indice_convolution_backward_filter : public testing::Test { } if (worksapce) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } } mluOpStatus_t compute() { @@ -163,7 +163,7 @@ class indice_convolution_backward_filter : public testing::Test { if (features_) { VLOG(4) << "Destroy features"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_)); features_ = nullptr; } @@ -175,7 +175,7 @@ class indice_convolution_backward_filter : public testing::Test { if (output_grad_) { VLOG(4) << "Destroy output_grad"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_grad_)); + GTEST_CHECK(cnrtSuccess == 
cnrtFree(output_grad_)); output_grad_ = nullptr; } @@ -187,13 +187,13 @@ class indice_convolution_backward_filter : public testing::Test { if (indice_pairs_) { VLOG(4) << "Destroy indice_pairs"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } @@ -205,7 +205,7 @@ class indice_convolution_backward_filter : public testing::Test { if (filters_grad_) { VLOG(4) << "Destroy filters_grad"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(filters_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(filters_grad_)); filters_grad_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter_general.cpp index 04969b063..b644b64ac 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_backward_filter/indice_convolution_backward_filter_general.cpp @@ -51,11 +51,11 @@ class indice_convolution_backward_filter_general features_params.get_dim_size().data())); if (mluOpGetTensorElementNum(features_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_, mluOpDataTypeBytes(features_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&features_, mluOpDataTypeBytes(features_params.get_dtype()) * mluOpGetTensorElementNum(features_desc_))); @@ -69,13 +69,13 @@ class indice_convolution_backward_filter_general output_grad_params.get_dim_size().data())); if 
(mluOpGetTensorElementNum(output_grad_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &output_grad_, mluOpDataTypeBytes(output_grad_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_grad_, mluOpDataTypeBytes(output_grad_params.get_dtype()) * mluOpGetTensorElementNum(output_grad_desc_))); @@ -89,13 +89,13 @@ class indice_convolution_backward_filter_general indice_pairs_params.get_dim_size().data())); if (mluOpGetTensorElementNum(indice_pairs_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &indice_pairs_, mluOpDataTypeBytes(indice_pairs_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpDataTypeBytes(indice_pairs_params.get_dtype()) * mluOpGetTensorElementNum(indice_pairs_desc_))); @@ -109,13 +109,13 @@ class indice_convolution_backward_filter_general filters_grad_params.get_dim_size().data())); if (mluOpGetTensorElementNum(filters_grad_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &filters_grad_, mluOpDataTypeBytes(filters_grad_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&filters_grad_, mluOpDataTypeBytes(filters_grad_params.get_dtype()) * mluOpGetTensorElementNum(filters_grad_desc_))); @@ -149,7 +149,7 @@ class indice_convolution_backward_filter_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpIndiceConvolutionBackwardFilter( handle_, features_desc_, features_, output_grad_desc_, output_grad_, @@ -175,7 +175,7 @@ class indice_convolution_backward_filter_general if (features_) { VLOG(4) << "Destroy features"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(features_)); features_ = nullptr; } @@ -187,7 +187,7 @@ class indice_convolution_backward_filter_general if (output_grad_) { VLOG(4) << "Destroy output_grad"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_grad_)); output_grad_ = nullptr; } @@ -199,7 +199,7 @@ class indice_convolution_backward_filter_general if (indice_pairs_) { VLOG(4) << "Destroy indice_pairs"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = nullptr; } @@ -211,7 +211,7 @@ class indice_convolution_backward_filter_general if (filters_grad_) { VLOG(4) << "Destroy filters_grad"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(filters_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(filters_grad_)); filters_grad_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward.cpp index 0900627a0..8bf8201bf 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward.cpp @@ -53,12 +53,12 @@ class indice_convolution_forward : public testing::Test { if (features) { if (features_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_, mluOpGetTensorElementNum(features_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -74,12 +74,12 @@ class indice_convolution_forward : public testing::Test { if (filters) { if (filters_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&filters_, mluOpGetTensorElementNum(filters_desc_) * 
mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&filters_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -94,12 +94,12 @@ class indice_convolution_forward : public testing::Test { if (indice_pairs) { if (indice_pairs_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpGetTensorElementNum(indice_pairs_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indice_pairs_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -115,12 +115,12 @@ class indice_convolution_forward : public testing::Test { if (features_out) { if (features_out_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&features_out_, mluOpGetTensorElementNum(features_out_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&features_out_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -134,7 +134,7 @@ class indice_convolution_forward : public testing::Test { } if (worksapce) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } } mluOpStatus_t compute() { @@ -164,7 +164,7 @@ class indice_convolution_forward : public testing::Test { if (features_) { VLOG(4) << "Destroy features"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_)); features_ = nullptr; } @@ -176,7 +176,7 @@ class indice_convolution_forward : public testing::Test { if (filters_) { VLOG(4) << "Destroy filters"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(filters_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(filters_)); filters_ = nullptr; } @@ -188,13 +188,13 @@ class indice_convolution_forward : public testing::Test { if (indice_pairs_) { VLOG(4) << "Destroy indice_pairs"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } @@ -206,7 +206,7 @@ class indice_convolution_forward : public testing::Test { if (features_out_) { VLOG(4) << "Destroy features_out"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_out_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_out_)); features_out_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward_general.cpp index 1ecb2439a..1ace5ba40 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/indice_convolution_forward/indice_convolution_forward_general.cpp @@ -54,11 +54,11 @@ class indice_convolution_forward_general features_params.get_dim_size().data())); if (mluOpGetTensorElementNum(features_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_, mluOpDataTypeBytes(features_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&features_, mluOpDataTypeBytes(features_params.get_dtype()) * mluOpGetTensorElementNum(features_desc_))); @@ -72,11 +72,11 @@ class indice_convolution_forward_general filters_params.get_dim_size().data())); if (mluOpGetTensorElementNum(filters_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&filters_, mluOpDataTypeBytes(filters_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&filters_, 
mluOpDataTypeBytes(filters_params.get_dtype()) * mluOpGetTensorElementNum(filters_desc_))); @@ -90,13 +90,13 @@ class indice_convolution_forward_general indice_pairs_params.get_dim_size().data())); if (mluOpGetTensorElementNum(indice_pairs_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &indice_pairs_, mluOpDataTypeBytes(indice_pairs_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indice_pairs_, mluOpDataTypeBytes(indice_pairs_params.get_dtype()) * mluOpGetTensorElementNum(indice_pairs_desc_))); @@ -110,13 +110,13 @@ class indice_convolution_forward_general features_out_params.get_dim_size().data())); if (mluOpGetTensorElementNum(features_out_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &features_out_, mluOpDataTypeBytes(features_out_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&features_out_, mluOpDataTypeBytes(features_out_params.get_dtype()) * mluOpGetTensorElementNum(features_out_desc_))); @@ -149,7 +149,7 @@ class indice_convolution_forward_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpIndiceConvolutionForward( handle_, features_desc_, features_, filters_desc_, filters_, @@ -176,7 +176,7 @@ class indice_convolution_forward_general if (features_) { VLOG(4) << "Destroy features"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_)); features_ = nullptr; } @@ -188,7 +188,7 @@ class indice_convolution_forward_general if (filters_) { VLOG(4) << "Destroy filters"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(filters_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(filters_)); filters_ = nullptr; } @@ -200,7 +200,7 @@ class 
indice_convolution_forward_general if (indice_pairs_) { VLOG(4) << "Destroy indice_pairs"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indice_pairs_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indice_pairs_)); indice_pairs_ = nullptr; } @@ -212,7 +212,7 @@ class indice_convolution_forward_general if (features_out_) { VLOG(4) << "Destroy features_out"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_out_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_out_)); features_out_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward.cpp index 8e317ef63..5bb2d4fff 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward.cpp @@ -51,11 +51,11 @@ class masked_col2im_forward : public testing::Test { if (col) { if (col_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&col_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(col_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&col_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -70,12 +70,12 @@ class masked_col2im_forward : public testing::Test { if (mask_h_idx) { if (mask_h_idx_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_h_idx_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(mask_h_idx_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_h_idx_, MLUOP_DTYPE_INT32 * 2)); } } @@ -90,12 +90,12 @@ class masked_col2im_forward : public testing::Test { if (mask_w_idx) { if (mask_w_idx_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_w_idx_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(mask_w_idx_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_w_idx_, MLUOP_DTYPE_INT32 * 2)); } } @@ -110,17 +110,17 @@ class masked_col2im_forward : public testing::Test { if (im) { if (im_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&im_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(im_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&im_, MLUOP_DTYPE_FLOAT * 2)); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } } @@ -151,7 +151,7 @@ class masked_col2im_forward : public testing::Test { if (col_) { VLOG(4) << "Destroy col_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(col_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(col_)); col_ = nullptr; } @@ -163,7 +163,7 @@ class masked_col2im_forward : public testing::Test { if (mask_h_idx_) { VLOG(4) << "Destroy mask_h_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_h_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_h_idx_)); mask_h_idx_ = nullptr; } @@ -175,7 +175,7 @@ class masked_col2im_forward : public testing::Test { if (mask_w_idx_) { VLOG(4) << "Destroy mask_w_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_w_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_w_idx_)); mask_w_idx_ = nullptr; } @@ -187,13 +187,13 @@ class masked_col2im_forward : public testing::Test { if (im_) { VLOG(4) << "Destroy im_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(im_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(im_)); im_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward_general.cpp index cee47c4c1..0484b0a6c 100644 --- 
a/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/masked_col2im_forward/masked_col2im_forward_general.cpp @@ -51,11 +51,11 @@ class masked_col2im_forward_general col_params.get_dim_nb(), col_params.get_dim_size().data())); if (mluOpGetTensorElementNum(col_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&col_, mluOpDataTypeBytes(col_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&col_, mluOpDataTypeBytes(col_params.get_dtype()) * mluOpGetTensorElementNum(col_desc_))); } @@ -68,12 +68,12 @@ class masked_col2im_forward_general mask_h_idx_params.get_dim_size().data())); if (mluOpGetTensorElementNum(mask_h_idx_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_h_idx_, mluOpDataTypeBytes(mask_h_idx_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_h_idx_, mluOpDataTypeBytes(mask_h_idx_params.get_dtype()) * mluOpGetTensorElementNum(mask_h_idx_desc_))); @@ -87,12 +87,12 @@ class masked_col2im_forward_general mask_w_idx_params.get_dim_size().data())); if (mluOpGetTensorElementNum(mask_w_idx_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_w_idx_, mluOpDataTypeBytes(mask_w_idx_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_w_idx_, mluOpDataTypeBytes(mask_w_idx_params.get_dtype()) * mluOpGetTensorElementNum(mask_w_idx_desc_))); @@ -105,10 +105,10 @@ class masked_col2im_forward_general im_params.get_dim_nb(), im_params.get_dim_size().data())); if (mluOpGetTensorElementNum(im_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&im_, mluOpDataTypeBytes(im_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&im_, mluOpDataTypeBytes(im_params.get_dtype()) * mluOpGetTensorElementNum(im_desc_))); } @@ -116,7 +116,7 @@ class masked_col2im_forward_general target_device_ = std::get<4>(GetParam()); expected_status_ = std::get<5>(GetParam()); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -154,7 +154,7 @@ class masked_col2im_forward_general if (col_) { VLOG(4) << "Destroy col_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(col_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(col_)); col_ = nullptr; } @@ -166,7 +166,7 @@ class masked_col2im_forward_general if (mask_h_idx_) { VLOG(4) << "Destroy mask_h_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_h_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_h_idx_)); mask_h_idx_ = nullptr; } @@ -178,7 +178,7 @@ class masked_col2im_forward_general if (mask_w_idx_) { VLOG(4) << "Destroy mask_w_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_w_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_w_idx_)); mask_w_idx_ = nullptr; } @@ -190,13 +190,13 @@ class masked_col2im_forward_general if (im_) { VLOG(4) << "Destroy im_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(im_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(im_)); im_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward.cpp index 7d1735986..69087f80a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward.cpp @@ -53,11 +53,11 @@ 
class masked_im2col_forward : public testing::Test { if (feature) { if (feature_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feature_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(feature_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&feature_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -72,12 +72,12 @@ class masked_im2col_forward : public testing::Test { if (mask_h_idx) { if (mask_h_idx_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_h_idx_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(mask_h_idx_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_h_idx_, MLUOP_DTYPE_INT32 * 2)); } } @@ -92,12 +92,12 @@ class masked_im2col_forward : public testing::Test { if (mask_w_idx) { if (mask_w_idx_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_w_idx_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(mask_w_idx_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mask_w_idx_, MLUOP_DTYPE_INT32 * 2)); } } @@ -112,18 +112,18 @@ class masked_im2col_forward : public testing::Test { if (data_col) { if (data_col_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_col_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(data_col_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_col_, MLUOP_DTYPE_FLOAT * 2)); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } } @@ -154,7 +154,7 @@ class masked_im2col_forward : public testing::Test { if (feature_) { VLOG(4) << "Destroy feature_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(feature_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(feature_)); feature_ = nullptr; } @@ -166,7 +166,7 @@ class masked_im2col_forward : public testing::Test { if 
(mask_h_idx_) { VLOG(4) << "Destroy mask_h_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_h_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_h_idx_)); mask_h_idx_ = nullptr; } @@ -178,7 +178,7 @@ class masked_im2col_forward : public testing::Test { if (mask_w_idx_) { VLOG(4) << "Destroy mask_w_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_w_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_w_idx_)); mask_w_idx_ = nullptr; } @@ -190,13 +190,13 @@ class masked_im2col_forward : public testing::Test { if (data_col_) { VLOG(4) << "Destroy data_col_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_col_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_col_)); data_col_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward_general.cpp index 6363a608c..aea425777 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/masked_im2col_forward/masked_im2col_forward_general.cpp @@ -52,11 +52,11 @@ class masked_im2col_forward_general feature_params.get_dim_size().data())); if (mluOpGetTensorElementNum(feature_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&feature_, mluOpDataTypeBytes(feature_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&feature_, mluOpDataTypeBytes(feature_params.get_dtype()) * mluOpGetTensorElementNum(feature_desc_))); @@ -70,12 +70,12 @@ class masked_im2col_forward_general mask_h_idx_params.get_dim_size().data())); if (mluOpGetTensorElementNum(mask_h_idx_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - 
CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_h_idx_, mluOpDataTypeBytes(mask_h_idx_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_h_idx_, mluOpDataTypeBytes(mask_h_idx_params.get_dtype()) * mluOpGetTensorElementNum(mask_h_idx_desc_))); @@ -89,12 +89,12 @@ class masked_im2col_forward_general mask_w_idx_params.get_dim_size().data())); if (mluOpGetTensorElementNum(mask_w_idx_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_w_idx_, mluOpDataTypeBytes(mask_w_idx_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&mask_w_idx_, mluOpDataTypeBytes(mask_w_idx_params.get_dtype()) * mluOpGetTensorElementNum(mask_w_idx_desc_))); @@ -108,11 +108,11 @@ class masked_im2col_forward_general data_col_params.get_dim_size().data())); if (mluOpGetTensorElementNum(data_col_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_col_, mluOpDataTypeBytes(data_col_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_col_, mluOpDataTypeBytes(data_col_params.get_dtype()) * mluOpGetTensorElementNum(data_col_desc_))); @@ -123,7 +123,7 @@ class masked_im2col_forward_general target_device_ = std::get<6>(GetParam()); expected_status_ = std::get<7>(GetParam()); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -161,7 +161,7 @@ class masked_im2col_forward_general if (feature_) { VLOG(4) << "Destroy feature_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(feature_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(feature_)); feature_ = nullptr; } @@ -173,7 +173,7 @@ class masked_im2col_forward_general if (mask_h_idx_) { VLOG(4) << "Destroy mask_h_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS 
== cnrtFree(mask_h_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_h_idx_)); mask_h_idx_ = nullptr; } @@ -185,7 +185,7 @@ class masked_im2col_forward_general if (mask_w_idx_) { VLOG(4) << "Destroy mask_w_idx_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mask_w_idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mask_w_idx_)); mask_w_idx_ = nullptr; } @@ -197,13 +197,13 @@ class masked_im2col_forward_general if (data_col_) { VLOG(4) << "Destroy data_col_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_col_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_col_)); data_col_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp index 1512f82f9..43d60f343 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp @@ -53,12 +53,12 @@ class moe_dispatch_backward_data : public testing::Test { if (gates) { if (gates_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&gates_, mluOpGetTensorElementNum(gates_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&gates_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -74,12 +74,12 @@ class moe_dispatch_backward_data : public testing::Test { if (indices) { if (indices_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpGetTensorElementNum(indices_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, 64 * 
mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -95,11 +95,11 @@ class moe_dispatch_backward_data : public testing::Test { if (locations) { if (locations_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpGetTensorElementNum(locations_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&locations_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -116,12 +116,12 @@ class moe_dispatch_backward_data : public testing::Test { if (dispatch) { if (dispatch_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, mluOpGetTensorElementNum(dispatch_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -136,12 +136,12 @@ class moe_dispatch_backward_data : public testing::Test { if (grad_input) { if (grad_input_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, mluOpGetTensorElementNum(grad_input_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -173,7 +173,7 @@ class moe_dispatch_backward_data : public testing::Test { if (gates_) { VLOG(4) << "Destroy gates_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(gates_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(gates_)); gates_ = nullptr; } @@ -185,7 +185,7 @@ class moe_dispatch_backward_data : public testing::Test { if (indices_) { VLOG(4) << "Destroy indices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = nullptr; } @@ -197,7 +197,7 @@ class moe_dispatch_backward_data : public testing::Test { if (locations_) { VLOG(4) << "Destroy locations_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(locations_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(locations_)); locations_ = nullptr; } @@ -209,7 +209,7 @@ class moe_dispatch_backward_data : public testing::Test { if (dispatch_) { VLOG(4) << "Destroy dispatch_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dispatch_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(dispatch_)); dispatch_ = nullptr; } @@ -221,7 +221,7 @@ class moe_dispatch_backward_data : public testing::Test { if (grad_input_) { VLOG(4) << "Destroy grad_input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data_general.cpp index af9455cd5..e9dd2e30c 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_data/moe_dispatch_backward_data_general.cpp @@ -64,12 +64,12 @@ class moe_dispatch_backward_data_general if (mluOpGetTensorElementNum(gates_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&gates_, mluOpDataTypeBytes(gates_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&gates_, mluOpDataTypeBytes(gates_params.get_dtype()) * mluOpGetTensorElementNum(gates_desc_))); } @@ -82,11 +82,11 @@ class moe_dispatch_backward_data_general indices_params.get_dim_size().data())); if (mluOpGetTensorElementNum(indices_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpDataTypeBytes(indices_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, mluOpDataTypeBytes(indices_params.get_dtype()) * mluOpGetTensorElementNum(indices_desc_))); @@ -100,12 
+100,12 @@ class moe_dispatch_backward_data_general locations_params.get_dim_size().data())); if (mluOpGetTensorElementNum(locations_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpDataTypeBytes(locations_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpDataTypeBytes(locations_params.get_dtype()) * mluOpGetTensorElementNum(locations_desc_))); @@ -119,11 +119,11 @@ class moe_dispatch_backward_data_general dispatch_params.get_dim_size().data())); if (mluOpGetTensorElementNum(dispatch_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, mluOpDataTypeBytes(dispatch_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&dispatch_, mluOpDataTypeBytes(dispatch_params.get_dtype()) * mluOpGetTensorElementNum(dispatch_desc_))); @@ -137,12 +137,12 @@ class moe_dispatch_backward_data_general grad_input_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * mluOpGetTensorElementNum(grad_input_desc_))); @@ -182,7 +182,7 @@ class moe_dispatch_backward_data_general if (gates_) { VLOG(4) << "Destroy gates"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(gates_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(gates_)); gates_ = nullptr; } @@ -194,7 +194,7 @@ class moe_dispatch_backward_data_general if (indices_) { VLOG(4) << "Destroy indices"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = nullptr; } @@ -206,7 +206,7 @@ class moe_dispatch_backward_data_general if (locations_) 
{ VLOG(4) << "Destroy locations"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(locations_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(locations_)); locations_ = nullptr; } @@ -218,7 +218,7 @@ class moe_dispatch_backward_data_general if (dispatch_) { VLOG(4) << "Destroy dispatch"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dispatch_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(dispatch_)); dispatch_ = nullptr; } @@ -230,7 +230,7 @@ class moe_dispatch_backward_data_general if (grad_input_) { VLOG(4) << "Destroy dispatch"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp index a2f5458b7..f73178993 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp @@ -53,12 +53,12 @@ class moe_dispatch_backward_gate : public testing::Test { if (indices) { if (indices_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpGetTensorElementNum(indices_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -74,11 +74,11 @@ class moe_dispatch_backward_gate : public testing::Test { if (locations) { if (locations_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpGetTensorElementNum(locations_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&locations_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -95,12 +95,12 @@ class moe_dispatch_backward_gate : public 
testing::Test { if (input) { if (input_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpGetTensorElementNum(input_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -116,12 +116,12 @@ class moe_dispatch_backward_gate : public testing::Test { if (dispatch) { if (dispatch_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, mluOpGetTensorElementNum(dispatch_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -136,19 +136,19 @@ class moe_dispatch_backward_gate : public testing::Test { if (grad_gates) { if (grad_gates_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_gates_, mluOpGetTensorElementNum(grad_gates_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_gates_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } } mluOpStatus_t compute() { @@ -178,7 +178,7 @@ class moe_dispatch_backward_gate : public testing::Test { if (indices_) { VLOG(4) << "Destroy indices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = nullptr; } @@ -190,7 +190,7 @@ class moe_dispatch_backward_gate : public testing::Test { if (locations_) { VLOG(4) << "Destroy locations_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(locations_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(locations_)); locations_ = nullptr; } @@ -202,7 +202,7 @@ class moe_dispatch_backward_gate : public testing::Test { if (input_) { VLOG(4) << 
"Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } @@ -214,7 +214,7 @@ class moe_dispatch_backward_gate : public testing::Test { if (dispatch_) { VLOG(4) << "Destroy dispatch_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dispatch_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(dispatch_)); dispatch_ = nullptr; } @@ -226,13 +226,13 @@ class moe_dispatch_backward_gate : public testing::Test { if (grad_gates_) { VLOG(4) << "Destroy grad_gates_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_gates_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_gates_)); grad_gates_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate_general.cpp index 43dcd9a9a..575f9993d 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_backward_gate/moe_dispatch_backward_gate_general.cpp @@ -65,11 +65,11 @@ class moe_dispatch_backward_gate_general indices_params.get_dim_size().data())); if (mluOpGetTensorElementNum(indices_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpDataTypeBytes(indices_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, mluOpDataTypeBytes(indices_params.get_dtype()) * mluOpGetTensorElementNum(indices_desc_))); @@ -83,12 +83,12 @@ class moe_dispatch_backward_gate_general locations_params.get_dim_size().data())); if (mluOpGetTensorElementNum(locations_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - 
CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpDataTypeBytes(locations_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpDataTypeBytes(locations_params.get_dtype()) * mluOpGetTensorElementNum(locations_desc_))); @@ -102,12 +102,12 @@ class moe_dispatch_backward_gate_general if (mluOpGetTensorElementNum(input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * mluOpGetTensorElementNum(input_desc_))); } @@ -120,11 +120,11 @@ class moe_dispatch_backward_gate_general dispatch_params.get_dim_size().data())); if (mluOpGetTensorElementNum(dispatch_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, mluOpDataTypeBytes(dispatch_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&dispatch_, mluOpDataTypeBytes(dispatch_params.get_dtype()) * mluOpGetTensorElementNum(dispatch_desc_))); @@ -138,12 +138,12 @@ class moe_dispatch_backward_gate_general grad_gates_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_gates_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_gates_, mluOpDataTypeBytes(grad_gates_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_gates_, mluOpDataTypeBytes(grad_gates_params.get_dtype()) * mluOpGetTensorElementNum(grad_gates_desc_))); @@ -167,7 +167,7 @@ class moe_dispatch_backward_gate_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = 
mluOpMoeDispatchBackwardGate( handle_, indices_desc_, indices_, locations_desc_, locations_, @@ -192,7 +192,7 @@ class moe_dispatch_backward_gate_general if (indices_) { VLOG(4) << "Destroy indices_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = nullptr; } @@ -204,7 +204,7 @@ class moe_dispatch_backward_gate_general if (locations_) { VLOG(4) << "Destroy locations_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(locations_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(locations_)); locations_ = nullptr; } @@ -216,7 +216,7 @@ class moe_dispatch_backward_gate_general if (input_) { VLOG(4) << "Destroy input_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = nullptr; } @@ -228,7 +228,7 @@ class moe_dispatch_backward_gate_general if (dispatch_) { VLOG(4) << "Destroy dispatch_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dispatch_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(dispatch_)); dispatch_ = nullptr; } @@ -240,7 +240,7 @@ class moe_dispatch_backward_gate_general if (grad_gates_) { VLOG(4) << "Destroy grad_gates_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_gates_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_gates_)); grad_gates_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward.cpp index 1a3c0a930..5c21162aa 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward.cpp @@ -51,12 +51,12 @@ class moe_dispatch_forward : public testing::Test { if (gates) { if (gates_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&gates_, mluOpGetTensorElementNum(gates_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + 
cnrtSuccess == cnrtMalloc(&gates_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -70,12 +70,12 @@ class moe_dispatch_forward : public testing::Test { if (indices) { if (indices_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, mluOpGetTensorElementNum(indices_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -89,12 +89,12 @@ class moe_dispatch_forward : public testing::Test { if (locations) { if (locations_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, mluOpGetTensorElementNum(locations_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -108,12 +108,12 @@ class moe_dispatch_forward : public testing::Test { if (input) { if (input_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpGetTensorElementNum(input_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, 4 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -127,12 +127,12 @@ class moe_dispatch_forward : public testing::Test { if (dispatch) { if (dispatch_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, mluOpGetTensorElementNum(dispatch_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, 8 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -159,7 +159,7 @@ class moe_dispatch_forward : public testing::Test { gates_desc_ = NULL; } if (gates_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(gates_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(gates_)); gates_ = NULL; } if (indices_desc_) { @@ -167,7 +167,7 @@ class moe_dispatch_forward : public testing::Test { 
indices_desc_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (locations_desc_) { @@ -175,7 +175,7 @@ class moe_dispatch_forward : public testing::Test { locations_desc_ = NULL; } if (locations_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(locations_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(locations_)); locations_ = NULL; } if (input_desc_) { @@ -183,7 +183,7 @@ class moe_dispatch_forward : public testing::Test { input_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (dispatch_desc_) { @@ -191,7 +191,7 @@ class moe_dispatch_forward : public testing::Test { dispatch_desc_ = NULL; } if (dispatch_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dispatch_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(dispatch_)); dispatch_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward_general.cpp index 217e03b2a..5b0c2c839 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/moe_dispatch_forward/moe_dispatch_forward_general.cpp @@ -64,10 +64,10 @@ class moe_dispatch_forward_general uint gates_ele_num = mluOpGetTensorElementNum(gates_desc_); if (gates_ele_num > 0) { if (mluOpGetTensorElementNum(gates_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&gates_, 12 * mluOpDataTypeBytes(gates_dtype))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&gates_, gates_ele_num * mluOpDataTypeBytes(gates_dtype))); } @@ -86,11 +86,11 @@ class moe_dispatch_forward_general if (indices_ele_num > 0) { if (mluOpGetTensorElementNum(indices_desc_) >= LARGE_TENSOR_NUM) { 
GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, 12 * mluOpDataTypeBytes(indices_dtype))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&indices_, indices_ele_num * mluOpDataTypeBytes(indices_dtype))); } @@ -109,11 +109,11 @@ class moe_dispatch_forward_general if (locations_ele_num > 0) { if (mluOpGetTensorElementNum(locations_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, 12 * mluOpDataTypeBytes(locations_dtype))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&locations_, locations_ele_num * mluOpDataTypeBytes(locations_dtype))); } @@ -130,10 +130,10 @@ class moe_dispatch_forward_general uint input_ele_num = mluOpGetTensorElementNum(input_desc_); if (input_ele_num > 0) { if (mluOpGetTensorElementNum(input_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, 12 * mluOpDataTypeBytes(input_dtype))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, input_ele_num * mluOpDataTypeBytes(input_dtype))); } @@ -155,11 +155,11 @@ class moe_dispatch_forward_general if (dispatch_ele_num > 0) { if (mluOpGetTensorElementNum(dispatch_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, 12 * mluOpDataTypeBytes(dispatch_dtype))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&dispatch_, dispatch_ele_num * mluOpDataTypeBytes(dispatch_dtype))); } @@ -193,7 +193,7 @@ class moe_dispatch_forward_general gates_desc_ = NULL; } if (gates_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(gates_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(gates_)); gates_ = NULL; } if (indices_desc_) { @@ -201,7 +201,7 @@ class moe_dispatch_forward_general indices_desc_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); 
indices_ = NULL; } if (locations_desc_) { @@ -209,7 +209,7 @@ class moe_dispatch_forward_general locations_desc_ = NULL; } if (locations_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(locations_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(locations_)); locations_ = NULL; } if (input_desc_) { @@ -217,7 +217,7 @@ class moe_dispatch_forward_general input_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (dispatch_desc_) { @@ -225,7 +225,7 @@ class moe_dispatch_forward_general dispatch_desc_ = NULL; } if (dispatch_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dispatch_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(dispatch_)); dispatch_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward.cpp index 53f985e37..5288772c6 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward.cpp @@ -57,12 +57,12 @@ class ms_deform_attn_backward : public testing::Test { if (value) { if (value_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&value_, mluOpGetTensorElementNum(value_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&value_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -77,12 +77,12 @@ class ms_deform_attn_backward : public testing::Test { if (spatial_shapes) { if (spatial_shapes_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&spatial_shapes_, mluOpGetTensorElementNum(spatial_shapes_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&spatial_shapes_, 2 * 
mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -99,12 +99,12 @@ class ms_deform_attn_backward : public testing::Test { if (level_start_index) { if (level_start_index_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&level_start_index_, mluOpGetTensorElementNum(level_start_index_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&level_start_index_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -120,12 +120,12 @@ class ms_deform_attn_backward : public testing::Test { if (sampling_loc) { if (sampling_loc_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&sampling_loc_, mluOpGetTensorElementNum(sampling_loc_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&sampling_loc_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -141,12 +141,12 @@ class ms_deform_attn_backward : public testing::Test { if (attn_weight) { if (attn_weight_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&attn_weight_, mluOpGetTensorElementNum(attn_weight_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&attn_weight_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -162,12 +162,12 @@ class ms_deform_attn_backward : public testing::Test { if (grad_output) { if (grad_output_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, mluOpGetTensorElementNum(grad_output_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -183,12 +183,12 @@ class ms_deform_attn_backward : public testing::Test { if (grad_value) { if (grad_value_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_value_, mluOpGetTensorElementNum(grad_value_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_value_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -205,12 +205,12 @@ class ms_deform_attn_backward : public testing::Test { if (grad_sampling_loc) { if (grad_sampling_loc_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_sampling_loc_, mluOpGetTensorElementNum(grad_sampling_loc_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_sampling_loc_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -227,12 +227,12 @@ class ms_deform_attn_backward : public testing::Test { if (grad_attn_weight) { if (grad_attn_weight_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_attn_weight_, mluOpGetTensorElementNum(grad_attn_weight_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_attn_weight_, 2 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -268,7 +268,7 @@ class ms_deform_attn_backward : public testing::Test { if (value_) { VLOG(4) << "Destroy value"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(value_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(value_)); value_ = nullptr; } @@ -280,7 +280,7 @@ class ms_deform_attn_backward : public testing::Test { if (spatial_shapes_) { VLOG(4) << "Destroy spatial_shapes"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(spatial_shapes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(spatial_shapes_)); spatial_shapes_ = nullptr; } @@ -292,7 +292,7 @@ class ms_deform_attn_backward : public testing::Test { if (level_start_index_) { VLOG(4) << "Destroy level_start_index"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(level_start_index_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(level_start_index_)); 
level_start_index_ = nullptr; } @@ -304,7 +304,7 @@ class ms_deform_attn_backward : public testing::Test { if (sampling_loc_) { VLOG(4) << "Destroy sampling_loc"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(sampling_loc_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(sampling_loc_)); sampling_loc_ = nullptr; } @@ -316,7 +316,7 @@ class ms_deform_attn_backward : public testing::Test { if (attn_weight_) { VLOG(4) << "Destroy attn_weight"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(attn_weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(attn_weight_)); attn_weight_ = nullptr; } @@ -328,7 +328,7 @@ class ms_deform_attn_backward : public testing::Test { if (grad_output_) { VLOG(4) << "Destroy grad_output"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = nullptr; } @@ -340,7 +340,7 @@ class ms_deform_attn_backward : public testing::Test { if (grad_value_) { VLOG(4) << "Destroy grad_value"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_value_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_value_)); grad_value_ = nullptr; } @@ -352,7 +352,7 @@ class ms_deform_attn_backward : public testing::Test { if (grad_sampling_loc_) { VLOG(4) << "Destroy grad_sampling_loc"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_sampling_loc_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_sampling_loc_)); grad_sampling_loc_ = nullptr; } @@ -364,7 +364,7 @@ class ms_deform_attn_backward : public testing::Test { if (grad_attn_weight_) { VLOG(4) << "Destroy grad_attn_weight"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_attn_weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_attn_weight_)); grad_attn_weight_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward_general.cpp index 36e42ce63..98fd196fb 100644 --- 
a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_backward/ms_deform_attn_backward_general.cpp @@ -58,12 +58,12 @@ class ms_deform_attn_backward_general if (mluOpGetTensorElementNum(value_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&value_, mluOpDataTypeBytes(value_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&value_, mluOpDataTypeBytes(value_params.get_dtype()) * mluOpGetTensorElementNum(value_desc_))); } @@ -76,13 +76,13 @@ class ms_deform_attn_backward_general spatial_shapes_params.get_dim_size().data())); if (mluOpGetTensorElementNum(spatial_shapes_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &spatial_shapes_, mluOpDataTypeBytes(spatial_shapes_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&spatial_shapes_, mluOpDataTypeBytes(spatial_shapes_params.get_dtype()) * mluOpGetTensorElementNum(spatial_shapes_desc_))); @@ -99,13 +99,13 @@ class ms_deform_attn_backward_general if (mluOpGetTensorElementNum(level_start_index_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &level_start_index_, mluOpDataTypeBytes(level_start_index_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &level_start_index_, mluOpDataTypeBytes(level_start_index_params.get_dtype()) * @@ -120,13 +120,13 @@ class ms_deform_attn_backward_general sampling_loc_params.get_dim_size().data())); if (mluOpGetTensorElementNum(sampling_loc_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &sampling_loc_, mluOpDataTypeBytes(sampling_loc_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&sampling_loc_, 
mluOpDataTypeBytes(sampling_loc_params.get_dtype()) * mluOpGetTensorElementNum(sampling_loc_desc_))); @@ -140,12 +140,12 @@ class ms_deform_attn_backward_general attn_weight_params.get_dim_size().data())); if (mluOpGetTensorElementNum(attn_weight_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&attn_weight_, mluOpDataTypeBytes(attn_weight_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&attn_weight_, mluOpDataTypeBytes(attn_weight_params.get_dtype()) * mluOpGetTensorElementNum(attn_weight_desc_))); @@ -159,12 +159,12 @@ class ms_deform_attn_backward_general grad_output_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_output_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_output_, mluOpDataTypeBytes(grad_output_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_output_, mluOpDataTypeBytes(grad_output_params.get_dtype()) * mluOpGetTensorElementNum(grad_output_desc_))); @@ -178,12 +178,12 @@ class ms_deform_attn_backward_general grad_value_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_value_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_value_, mluOpDataTypeBytes(grad_value_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_value_, mluOpDataTypeBytes(grad_value_params.get_dtype()) * mluOpGetTensorElementNum(grad_value_desc_))); @@ -199,13 +199,13 @@ class ms_deform_attn_backward_general if (mluOpGetTensorElementNum(grad_sampling_loc_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &grad_sampling_loc_, mluOpDataTypeBytes(grad_sampling_loc_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &grad_sampling_loc_, 
mluOpDataTypeBytes(grad_sampling_loc_params.get_dtype()) * @@ -222,13 +222,13 @@ class ms_deform_attn_backward_general if (mluOpGetTensorElementNum(grad_attn_weight_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &grad_attn_weight_, mluOpDataTypeBytes(grad_attn_weight_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_attn_weight_, mluOpDataTypeBytes(grad_attn_weight_params.get_dtype()) * mluOpGetTensorElementNum(grad_attn_weight_desc_))); @@ -277,7 +277,7 @@ class ms_deform_attn_backward_general if (value_) { VLOG(4) << "Destroy value"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(value_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(value_)); value_ = nullptr; } @@ -289,7 +289,7 @@ class ms_deform_attn_backward_general if (spatial_shapes_) { VLOG(4) << "Destroy spatial_shapes"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(spatial_shapes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(spatial_shapes_)); spatial_shapes_ = nullptr; } @@ -301,7 +301,7 @@ class ms_deform_attn_backward_general if (level_start_index_) { VLOG(4) << "Destroy level_start_index"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(level_start_index_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(level_start_index_)); level_start_index_ = nullptr; } @@ -313,7 +313,7 @@ class ms_deform_attn_backward_general if (sampling_loc_) { VLOG(4) << "Destroy sampling_loc"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(sampling_loc_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(sampling_loc_)); sampling_loc_ = nullptr; } @@ -325,7 +325,7 @@ class ms_deform_attn_backward_general if (attn_weight_) { VLOG(4) << "Destroy attn_weight"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(attn_weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(attn_weight_)); attn_weight_ = nullptr; } @@ -337,7 +337,7 @@ class ms_deform_attn_backward_general if (grad_output_) { VLOG(4) << "Destroy grad_output"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = nullptr; } @@ -349,7 +349,7 @@ class ms_deform_attn_backward_general if (grad_value_) { VLOG(4) << "Destroy grad_value"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_value_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_value_)); grad_value_ = nullptr; } @@ -361,7 +361,7 @@ class ms_deform_attn_backward_general if (grad_sampling_loc_) { VLOG(4) << "Destroy grad_sampling_loc"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_sampling_loc_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_sampling_loc_)); grad_sampling_loc_ = nullptr; } @@ -373,7 +373,7 @@ class ms_deform_attn_backward_general if (grad_attn_weight_) { VLOG(4) << "Destroy grad_attn_weight"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_attn_weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_attn_weight_)); grad_attn_weight_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward.cpp index 2def0863f..6fa424646 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward.cpp @@ -54,12 +54,12 @@ class ms_deform_attn_forward : public testing::Test { if (data_value) { if (data_value_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_value_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(data_value_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_value_, MLUOP_DTYPE_INT32 * 2)); } } @@ -75,12 +75,12 @@ class ms_deform_attn_forward : public testing::Test { if (data_spatial_shapes) { if (data_spatial_shapes_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_spatial_shapes_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum( data_spatial_shapes_desc_))); } else { - 
GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_spatial_shapes_, MLUOP_DTYPE_INT32 * 2)); } } @@ -96,12 +96,12 @@ class ms_deform_attn_forward : public testing::Test { if (data_level_start_index) { if (data_level_start_index_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_level_start_index_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum( data_level_start_index_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&data_level_start_index_, + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_level_start_index_, MLUOP_DTYPE_INT32 * 2)); } } @@ -117,12 +117,12 @@ class ms_deform_attn_forward : public testing::Test { if (data_sampling_loc) { if (data_sampling_loc_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_sampling_loc_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(data_sampling_loc_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_sampling_loc_, MLUOP_DTYPE_INT32 * 2)); } } @@ -138,12 +138,12 @@ class ms_deform_attn_forward : public testing::Test { if (data_attn_weight) { if (data_attn_weight_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_attn_weight_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(data_attn_weight_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_attn_weight_, MLUOP_DTYPE_INT32 * 2)); } } @@ -158,12 +158,12 @@ class ms_deform_attn_forward : public testing::Test { if (data_col) { if (data_col_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_col_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(data_col_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_col_, MLUOP_DTYPE_INT32 * 2)); } } @@ -197,7 +197,7 @@ class ms_deform_attn_forward : public testing::Test { if (data_value_) { VLOG(4) << "Destroy data_value"; - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtFree(data_value_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_value_)); data_value_ = nullptr; } @@ -209,7 +209,7 @@ class ms_deform_attn_forward : public testing::Test { if (data_spatial_shapes_) { VLOG(4) << "Destroy data_spatial_shapes"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_spatial_shapes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_spatial_shapes_)); data_spatial_shapes_ = nullptr; } @@ -221,7 +221,7 @@ class ms_deform_attn_forward : public testing::Test { if (data_level_start_index_) { VLOG(4) << "Destroy data_level_start_index"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_level_start_index_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_level_start_index_)); data_level_start_index_ = nullptr; } @@ -233,7 +233,7 @@ class ms_deform_attn_forward : public testing::Test { if (data_sampling_loc_) { VLOG(4) << "Destroy data_sampling_loc"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_sampling_loc_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_sampling_loc_)); data_sampling_loc_ = nullptr; } @@ -245,7 +245,7 @@ class ms_deform_attn_forward : public testing::Test { if (data_attn_weight_) { VLOG(4) << "Destroy data_attn_weight"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_attn_weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_attn_weight_)); data_attn_weight_ = nullptr; } @@ -257,7 +257,7 @@ class ms_deform_attn_forward : public testing::Test { if (data_col_) { VLOG(4) << "Destroy data_col"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_col_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_col_)); data_col_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward_general.cpp index 5527e5edd..3b10eccd9 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward_general.cpp +++ 
b/test/mlu_op_gtest/api_gtest/src/gtest/ms_deform_attn_forward/ms_deform_attn_forward_general.cpp @@ -54,12 +54,12 @@ class ms_deform_attn_forward_general data_value_params.get_dim_size().data())); if (mluOpGetTensorElementNum(data_value_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_value_, mluOpDataTypeBytes(data_value_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_value_, mluOpDataTypeBytes(data_value_params.get_dtype()) * mluOpGetTensorElementNum(data_value_desc_))); @@ -75,14 +75,14 @@ class ms_deform_attn_forward_general if (mluOpGetTensorElementNum(data_spatial_shapes_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_spatial_shapes_, mluOpDataTypeBytes( data_spatial_shapes_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &data_spatial_shapes_, mluOpDataTypeBytes(data_spatial_shapes_params.get_dtype()) * @@ -100,14 +100,14 @@ class ms_deform_attn_forward_general if (mluOpGetTensorElementNum(data_level_start_index_desc_) >= LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_level_start_index_, mluOpDataTypeBytes( data_level_start_index_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &data_level_start_index_, mluOpDataTypeBytes(data_level_start_index_params.get_dtype()) * @@ -125,13 +125,13 @@ class ms_deform_attn_forward_general if (mluOpGetTensorElementNum(data_sampling_loc_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &data_sampling_loc_, mluOpDataTypeBytes(data_sampling_loc_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &data_sampling_loc_, mluOpDataTypeBytes(data_sampling_loc_params.get_dtype()) * @@ -149,13 +149,13 @@ class 
ms_deform_attn_forward_general if (mluOpGetTensorElementNum(data_attn_weight_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &data_attn_weight_, mluOpDataTypeBytes(data_attn_weight_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_attn_weight_, mluOpDataTypeBytes(data_attn_weight_params.get_dtype()) * mluOpGetTensorElementNum(data_attn_weight_desc_))); @@ -170,11 +170,11 @@ class ms_deform_attn_forward_general if (mluOpGetTensorElementNum(data_col_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&data_col_, mluOpDataTypeBytes(data_col_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&data_col_, mluOpDataTypeBytes(data_col_params.get_dtype()) * mluOpGetTensorElementNum(data_col_desc_))); @@ -218,7 +218,7 @@ class ms_deform_attn_forward_general } if (data_value_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_value_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_value_)); data_value_ = nullptr; } @@ -228,7 +228,7 @@ class ms_deform_attn_forward_general } if (data_spatial_shapes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_spatial_shapes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_spatial_shapes_)); data_spatial_shapes_ = nullptr; } @@ -238,7 +238,7 @@ class ms_deform_attn_forward_general } if (data_level_start_index_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_level_start_index_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_level_start_index_)); data_level_start_index_ = nullptr; } @@ -248,7 +248,7 @@ class ms_deform_attn_forward_general } if (data_sampling_loc_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_sampling_loc_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_sampling_loc_)); data_sampling_loc_ = nullptr; } @@ -258,7 +258,7 @@ class ms_deform_attn_forward_general } if (data_attn_weight_) { - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtFree(data_attn_weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_attn_weight_)); data_attn_weight_ = nullptr; } @@ -268,7 +268,7 @@ class ms_deform_attn_forward_general } if (data_col_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(data_col_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(data_col_)); data_col_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward.cpp index e452f5fd9..34900e815 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward.cpp @@ -52,11 +52,11 @@ class mutual_information_backward : public testing::Test { if (px) { if (px_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(px_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -71,11 +71,11 @@ class mutual_information_backward : public testing::Test { if (py) { if (py_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(py_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -90,12 +90,12 @@ class mutual_information_backward : public testing::Test { if (opt_boundary) { if (opt_boundary_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&opt_boundary_, MLUOP_DTYPE_INT64 * mluOpGetTensorElementNum( opt_boundary_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&opt_boundary_, MLUOP_DTYPE_INT64 * 2)); } } @@ -110,11 +110,11 @@ class mutual_information_backward : public testing::Test { if (p) 
{ if (p_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&p_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(p_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&p_, MLUOP_DTYPE_FLOAT * 2)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&p_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -128,12 +128,12 @@ class mutual_information_backward : public testing::Test { if (ans_grad) { if (ans_grad_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&ans_grad_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(ans_grad_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&ans_grad_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -149,11 +149,11 @@ class mutual_information_backward : public testing::Test { if (px_grad) { if (px_grad_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&px_grad_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(px_grad_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_grad_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -169,17 +169,17 @@ class mutual_information_backward : public testing::Test { if (py_grad) { if (py_grad_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&py_grad_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(py_grad_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_grad_, MLUOP_DTYPE_FLOAT * 2)); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } } @@ -211,7 +211,7 @@ class mutual_information_backward : public testing::Test { if (px_) { VLOG(4) << "Destroy px_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(px_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(px_)); px_ = nullptr; } @@ -223,7 +223,7 @@ class mutual_information_backward : public testing::Test { if (py_) { VLOG(4) << "Destroy py_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(py_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(py_)); py_ = nullptr; } @@ -235,7 +235,7 @@ class mutual_information_backward : public testing::Test { if (opt_boundary_) { VLOG(4) << "Destroy opt_boundary_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(opt_boundary_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(opt_boundary_)); opt_boundary_ = nullptr; } @@ -247,7 +247,7 @@ class mutual_information_backward : public testing::Test { if (p_) { VLOG(4) << "Destroy p_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(p_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(p_)); p_ = nullptr; } @@ -259,7 +259,7 @@ class mutual_information_backward : public testing::Test { if (ans_grad_) { VLOG(4) << "Destroy ans_grad_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(ans_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(ans_grad_)); ans_grad_ = nullptr; } @@ -271,7 +271,7 @@ class mutual_information_backward : public testing::Test { if (px_grad_) { VLOG(4) << "Destroy px_grad_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(px_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(px_grad_)); px_grad_ = nullptr; } @@ -283,13 +283,13 @@ class mutual_information_backward : public testing::Test { if (py_grad_) { VLOG(4) << "Destroy py_grad_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(py_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(py_grad_)); py_grad_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward_general.cpp index 0df6307aa..86b0d2dd1 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_backward/mutual_information_backward_general.cpp 
@@ -52,10 +52,10 @@ class mutual_information_backward_general px_params.get_dim_nb(), px_params.get_dim_size().data())); if (mluOpGetTensorElementNum(px_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&px_, mluOpDataTypeBytes(px_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_, mluOpDataTypeBytes(px_params.get_dtype()) * mluOpGetTensorElementNum(px_desc_))); } @@ -67,10 +67,10 @@ class mutual_information_backward_general py_params.get_dim_nb(), py_params.get_dim_size().data())); if (mluOpGetTensorElementNum(py_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&py_, mluOpDataTypeBytes(py_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_, mluOpDataTypeBytes(py_params.get_dtype()) * mluOpGetTensorElementNum(py_desc_))); } @@ -83,13 +83,13 @@ class mutual_information_backward_general opt_boundary_params.get_dim_size().data())); if (mluOpGetTensorElementNum(opt_boundary_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &opt_boundary_, mluOpDataTypeBytes(opt_boundary_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&opt_boundary_, mluOpDataTypeBytes(opt_boundary_params.get_dtype()) * mluOpGetTensorElementNum(opt_boundary_desc_))); @@ -102,10 +102,10 @@ class mutual_information_backward_general p_params.get_dim_nb(), p_params.get_dim_size().data())); if (mluOpGetTensorElementNum(p_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&p_, mluOpDataTypeBytes(p_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&p_, mluOpDataTypeBytes(p_params.get_dtype()) * mluOpGetTensorElementNum(p_desc_))); } @@ -118,11 +118,11 @@ class mutual_information_backward_general 
ans_grad_params.get_dim_size().data())); if (mluOpGetTensorElementNum(ans_grad_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&ans_grad_, mluOpDataTypeBytes(ans_grad_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&ans_grad_, mluOpDataTypeBytes(ans_grad_params.get_dtype()) * mluOpGetTensorElementNum(ans_grad_desc_))); @@ -136,11 +136,11 @@ class mutual_information_backward_general px_grad_params.get_dim_size().data())); if (mluOpGetTensorElementNum(px_grad_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&px_grad_, mluOpDataTypeBytes(px_grad_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_grad_, mluOpDataTypeBytes(px_grad_params.get_dtype()) * mluOpGetTensorElementNum(px_grad_desc_))); @@ -154,11 +154,11 @@ class mutual_information_backward_general py_grad_params.get_dim_size().data())); if (mluOpGetTensorElementNum(py_grad_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&py_grad_, mluOpDataTypeBytes(py_grad_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_grad_, mluOpDataTypeBytes(py_grad_params.get_dtype()) * mluOpGetTensorElementNum(py_grad_desc_))); @@ -167,7 +167,7 @@ class mutual_information_backward_general target_device_ = std::get<7>(GetParam()); expected_status_ = std::get<8>(GetParam()); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -206,7 +206,7 @@ class mutual_information_backward_general if (px_) { VLOG(4) << "Destroy px_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(px_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(px_)); px_ = nullptr; } @@ -218,7 +218,7 @@ class 
mutual_information_backward_general if (py_) { VLOG(4) << "Destroy py_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(py_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(py_)); py_ = nullptr; } @@ -230,7 +230,7 @@ class mutual_information_backward_general if (opt_boundary_) { VLOG(4) << "Destroy opt_boundary_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(opt_boundary_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(opt_boundary_)); opt_boundary_ = nullptr; } @@ -242,7 +242,7 @@ class mutual_information_backward_general if (p_) { VLOG(4) << "Destroy p_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(p_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(p_)); p_ = nullptr; } @@ -254,7 +254,7 @@ class mutual_information_backward_general if (ans_grad_) { VLOG(4) << "Destroy ans_grad_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(ans_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(ans_grad_)); ans_grad_ = nullptr; } @@ -266,7 +266,7 @@ class mutual_information_backward_general if (px_grad_) { VLOG(4) << "Destroy px_grad_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(px_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(px_grad_)); px_grad_ = nullptr; } @@ -278,13 +278,13 @@ class mutual_information_backward_general if (py_) { VLOG(4) << "Destroy py_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(py_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(py_)); py_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward.cpp index dd746fdfa..83d9971d0 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward.cpp @@ -51,11 +51,11 @@ class 
mutual_information_forward : public testing::Test { if (px) { if (px_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(px_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -70,11 +70,11 @@ class mutual_information_forward : public testing::Test { if (py) { if (py_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(py_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -89,12 +89,12 @@ class mutual_information_forward : public testing::Test { if (opt_boundary) { if (opt_boundary_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&opt_boundary_, MLUOP_DTYPE_INT64 * mluOpGetTensorElementNum( opt_boundary_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&opt_boundary_, MLUOP_DTYPE_INT64 * 2)); } } @@ -109,11 +109,11 @@ class mutual_information_forward : public testing::Test { if (p) { if (p_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&p_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(p_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&p_, MLUOP_DTYPE_FLOAT * 2)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&p_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -127,17 +127,17 @@ class mutual_information_forward : public testing::Test { if (ans) { if (ans_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&ans_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(ans_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&ans_, MLUOP_DTYPE_FLOAT * 2)); } } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * 
workspace_size_)); } } @@ -168,7 +168,7 @@ class mutual_information_forward : public testing::Test { if (px_) { VLOG(4) << "Destroy px_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(px_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(px_)); px_ = nullptr; } @@ -180,7 +180,7 @@ class mutual_information_forward : public testing::Test { if (py_) { VLOG(4) << "Destroy py_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(py_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(py_)); py_ = nullptr; } @@ -192,7 +192,7 @@ class mutual_information_forward : public testing::Test { if (opt_boundary_) { VLOG(4) << "Destroy opt_boundary_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(opt_boundary_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(opt_boundary_)); opt_boundary_ = nullptr; } @@ -204,7 +204,7 @@ class mutual_information_forward : public testing::Test { if (p_) { VLOG(4) << "Destroy p_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(p_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(p_)); p_ = nullptr; } @@ -216,13 +216,13 @@ class mutual_information_forward : public testing::Test { if (ans_) { VLOG(4) << "Destroy ans_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(ans_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(ans_)); ans_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward_general.cpp index 468a82ce0..0d833f4ca 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/mutual_information_forward/mutual_information_forward_general.cpp @@ -52,10 +52,10 @@ class mutual_information_forward_general px_params.get_dim_nb(), px_params.get_dim_size().data())); if 
(mluOpGetTensorElementNum(px_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&px_, mluOpDataTypeBytes(px_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&px_, mluOpDataTypeBytes(px_params.get_dtype()) * mluOpGetTensorElementNum(px_desc_))); } @@ -67,10 +67,10 @@ class mutual_information_forward_general py_params.get_dim_nb(), py_params.get_dim_size().data())); if (mluOpGetTensorElementNum(py_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&py_, mluOpDataTypeBytes(py_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&py_, mluOpDataTypeBytes(py_params.get_dtype()) * mluOpGetTensorElementNum(py_desc_))); } @@ -83,13 +83,13 @@ class mutual_information_forward_general opt_boundary_params.get_dim_size().data())); if (mluOpGetTensorElementNum(opt_boundary_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &opt_boundary_, mluOpDataTypeBytes(opt_boundary_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&opt_boundary_, mluOpDataTypeBytes(opt_boundary_params.get_dtype()) * mluOpGetTensorElementNum(opt_boundary_desc_))); @@ -102,10 +102,10 @@ class mutual_information_forward_general p_params.get_dim_nb(), p_params.get_dim_size().data())); if (mluOpGetTensorElementNum(p_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&p_, mluOpDataTypeBytes(p_params.get_dtype()) * 2)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&p_, mluOpDataTypeBytes(p_params.get_dtype()) * mluOpGetTensorElementNum(p_desc_))); } @@ -117,11 +117,11 @@ class mutual_information_forward_general ans_params.get_dim_nb(), ans_params.get_dim_size().data())); if (mluOpGetTensorElementNum(ans_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - 
CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&ans_, mluOpDataTypeBytes(ans_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&ans_, mluOpDataTypeBytes(ans_params.get_dtype()) * mluOpGetTensorElementNum(ans_desc_))); } @@ -129,7 +129,7 @@ class mutual_information_forward_general target_device_ = std::get<5>(GetParam()); expected_status_ = std::get<6>(GetParam()); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, MLUOP_DTYPE_FLOAT * workspace_size_)); } catch (const std::exception &e) { FAIL() << "MLUOPAPIGTEST: catched " << e.what() @@ -167,7 +167,7 @@ class mutual_information_forward_general if (px_) { VLOG(4) << "Destroy px_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(px_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(px_)); px_ = nullptr; } @@ -179,7 +179,7 @@ class mutual_information_forward_general if (py_) { VLOG(4) << "Destroy py_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(py_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(py_)); py_ = nullptr; } @@ -191,7 +191,7 @@ class mutual_information_forward_general if (opt_boundary_) { VLOG(4) << "Destroy opt_boundary_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(opt_boundary_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(opt_boundary_)); opt_boundary_ = nullptr; } @@ -203,7 +203,7 @@ class mutual_information_forward_general if (p_) { VLOG(4) << "Destroy p_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(p_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(p_)); p_ = nullptr; } @@ -215,13 +215,13 @@ class mutual_information_forward_general if (ans_) { VLOG(4) << "Destroy ans_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(ans_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(ans_)); ans_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated.cpp 
b/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated.cpp index 97ec3901c..cde109959 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated.cpp @@ -51,7 +51,7 @@ class nms_rotated : public testing::Test { size_t b_ele_num = 2 * 5; size_t b_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t b_bytes = b_ele_num * b_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, b_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, b_bytes)); } if (scores_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&scores_desc_)); @@ -64,10 +64,10 @@ class nms_rotated : public testing::Test { size_t s_ele_num = 2; size_t s_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t s_bytes = s_ele_num * s_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, s_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, s_bytes)); } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } if (output_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -80,11 +80,11 @@ class nms_rotated : public testing::Test { size_t o_ele_num = 2; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } if (result_num) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&result_num_, mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -110,7 +110,7 @@ class nms_rotated : public testing::Test { boxes_desc_ = NULL; } if (boxes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = NULL; } if (scores_desc_) { @@ -118,11 +118,11 @@ class nms_rotated : public testing::Test { scores_desc_ = NULL; } if 
(scores_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(scores_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(scores_)); scores_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (output_desc_) { @@ -130,11 +130,11 @@ class nms_rotated : public testing::Test { output_desc_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (result_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(result_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(result_num_)); result_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated_general.cpp index 42d6162c9..f1df34781 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/nms_rotated/nms_rotated_general.cpp @@ -57,9 +57,9 @@ class nms_rotated_general : public testing::TestWithParam { uint64_t b_bytes = mluOpDataTypeBytes(b_dtype) * b_ele_num; if (b_bytes > 0) { if (b_bytes < LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, b_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, b_bytes)) } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, 8)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, 8)) } } @@ -75,9 +75,9 @@ class nms_rotated_general : public testing::TestWithParam { uint64_t s_bytes = mluOpDataTypeBytes(s_dtype) * s_ele_num; if (s_bytes > 0) { if (s_bytes < LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, s_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, s_bytes)) } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, 8)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, 8)) } } @@ -93,13 +93,13 @@ class nms_rotated_general : public testing::TestWithParam { 
uint64_t o_bytes = mluOpDataTypeBytes(o_dtype) * o_ele_num; if (o_bytes > 0) { if (o_bytes < LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)) } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, 8)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, 8)) } } GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&result_num_, mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -117,10 +117,10 @@ class nms_rotated_general : public testing::TestWithParam { } if (workspace_size_ > 0) { if (workspace_size_ < LARGE_TENSOR_NUM) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, 8)); } } status = mluOpNmsRotated(handle_, iou_threshold_, boxes_desc_, boxes_, @@ -142,7 +142,7 @@ class nms_rotated_general : public testing::TestWithParam { boxes_desc_ = NULL; } if (boxes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = NULL; } if (scores_desc_) { @@ -150,11 +150,11 @@ class nms_rotated_general : public testing::TestWithParam { scores_desc_ = NULL; } if (scores_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(scores_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(scores_)); scores_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (output_desc_) { @@ -162,11 +162,11 @@ class nms_rotated_general : public testing::TestWithParam { output_desc_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (result_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(result_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(result_num_)); result_num_ = 
NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms.cpp index 929215e15..e19c47614 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms.cpp @@ -50,10 +50,10 @@ class poly_nms : public testing::Test { size_t i_ele_num = 2 * 9; size_t i_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t i_bytes = i_ele_num * i_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, i_bytes)); } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } if (output_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -66,11 +66,11 @@ class poly_nms : public testing::Test { size_t o_ele_num = 2; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } if (result_num) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&result_num_, mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -95,11 +95,11 @@ class poly_nms : public testing::Test { boxes_desc_ = NULL; } if (boxes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (output_desc_) { @@ -107,11 +107,11 @@ class poly_nms : public testing::Test { output_desc_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (result_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtFree(result_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(result_num_)); result_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms_general.cpp index 66d1cb0b5..2a5185fb4 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/poly_nms/poly_nms_general.cpp @@ -56,7 +56,7 @@ class poly_nms_general : public testing::TestWithParam { uint64_t b_ele_num = mluOpGetTensorElementNum(boxes_desc_); uint64_t b_bytes = mluOpDataTypeBytes(b_dtype) * b_ele_num; if (b_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, b_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, b_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -70,10 +70,10 @@ class poly_nms_general : public testing::TestWithParam { uint64_t o_ele_num = mluOpGetTensorElementNum(output_desc_); uint64_t o_bytes = mluOpDataTypeBytes(o_dtype) * o_ele_num; if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&result_num_, mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -89,7 +89,7 @@ class poly_nms_general : public testing::TestWithParam { destroy(); return status == expected_status_; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpPolyNms(handle_, boxes_desc_, boxes_, iou_threshold_, workspace_, workspace_size_, output_desc_, output_, result_num_); @@ -109,11 +109,11 @@ class poly_nms_general : public testing::TestWithParam { boxes_desc_ = NULL; } if (boxes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (output_desc_) { @@ -121,11 +121,11 @@ class poly_nms_general : public testing::TestWithParam { output_desc_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (result_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(result_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(result_num_)); result_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box.cpp index c0d24bc6e..da2fec41a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box.cpp @@ -50,7 +50,7 @@ class prior_box : public testing::Test { dim_size.data())); } if (min) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&min_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&min_, 8)); } if (aspect_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&aspect_ratios_desc_)); @@ -60,7 +60,7 @@ class prior_box : public testing::Test { MLUOP_DTYPE_FLOAT, 1, dim_size.data())); } if (aspect) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&aspect_ratios_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&aspect_ratios_, 8)); } if (variance_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&variance_desc_)); @@ -70,7 +70,7 @@ class prior_box : public testing::Test { dim_size.data())); } if (variance) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&variance_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&variance_, 8)); } if (max_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&max_desc_)); @@ -80,7 +80,7 @@ class prior_box : public testing::Test { dim_size.data())); } if (max) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&max_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&max_, 8)); } if (output_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ 
-90,7 +90,7 @@ class prior_box : public testing::Test { dim_size.data())); } if (output) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, 8)); } if (var_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&var_desc_)); @@ -100,7 +100,7 @@ class prior_box : public testing::Test { dim_size.data())); } if (var) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&var_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&var_, 8)); } } @@ -147,27 +147,27 @@ class prior_box : public testing::Test { var_desc_ = NULL; } if (min_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(min_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(min_)); min_ = NULL; } if (aspect_ratios_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(aspect_ratios_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(aspect_ratios_)); aspect_ratios_ = NULL; } if (variance_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(variance_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(variance_)); variance_ = NULL; } if (max_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(max_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(max_)); max_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (var_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(var_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(var_)); var_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box_general.cpp index 0739cafe5..4c55425b2 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/prior_box/prior_box_general.cpp @@ -60,7 +60,7 @@ class prior_box_general : public testing::TestWithParam { min_dim, min_dim_size.data())); const uint64_t min_ele_num = mluOpGetTensorElementNum(min_desc_); if (min_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&min_, 8)) + 
GTEST_CHECK(cnrtSuccess == cnrtMalloc(&min_, 8)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&aspect_ratios_desc_)); @@ -75,7 +75,7 @@ class prior_box_general : public testing::TestWithParam { const uint64_t aspect_ele_num = mluOpGetTensorElementNum(aspect_ratios_desc_); if (aspect_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&aspect_ratios_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&aspect_ratios_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&variance_desc_)); @@ -89,7 +89,7 @@ class prior_box_general : public testing::TestWithParam { variance_dim_size.data())); const uint64_t variance_ele_num = mluOpGetTensorElementNum(variance_desc_); if (variance_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&variance_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&variance_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&max_desc_)); @@ -102,7 +102,7 @@ class prior_box_general : public testing::TestWithParam { max_dim, max_dim_size.data())); const uint64_t max_ele_num = mluOpGetTensorElementNum(max_desc_); if (max_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&max_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&max_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -115,7 +115,7 @@ class prior_box_general : public testing::TestWithParam { o_dim_size.data())); const uint64_t o_ele_num = mluOpGetTensorElementNum(output_desc_); if (o_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&var_desc_)); @@ -128,7 +128,7 @@ class prior_box_general : public testing::TestWithParam { var_dim, var_dim_size.data())); const uint64_t var_ele_num = mluOpGetTensorElementNum(var_desc_); if (var_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&var_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&var_, 8)); } PriorBoxDescParam params = std::get<6>(GetParam()); @@ -184,27 +184,27 @@ class 
prior_box_general : public testing::TestWithParam { var_desc_ = NULL; } if (min_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(min_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(min_)); min_ = NULL; } if (aspect_ratios_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(aspect_ratios_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(aspect_ratios_)); aspect_ratios_ = NULL; } if (variance_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(variance_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(variance_)); variance_ = NULL; } if (max_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(max_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(max_)); max_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (var_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(var_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(var_)); var_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward.cpp index f00dcf79e..ff63f3385 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward.cpp @@ -52,7 +52,7 @@ class psroipool_backward : public testing::Test { size_t b_ele_num = 1 * 5 * 5 * 9; size_t b_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t b_bytes = b_ele_num * b_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&bottom_grad_, b_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&bottom_grad_, b_bytes)); } if (rois_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&rois_desc_)); @@ -64,7 +64,7 @@ class psroipool_backward : public testing::Test { size_t r_ele_num = 1 * 5; size_t r_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t r_bytes = r_ele_num * r_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, r_bytes)); + GTEST_CHECK(cnrtSuccess == 
cnrtMalloc(&rois_, r_bytes)); } if (top_grad_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&top_grad_desc_)); @@ -77,7 +77,7 @@ class psroipool_backward : public testing::Test { size_t o_ele_num = 1 * 3 * 3 * 1; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&top_grad_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&top_grad_, o_bytes)); } if (mapping_channel_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&mapping_channel_desc_)); @@ -90,7 +90,7 @@ class psroipool_backward : public testing::Test { size_t m_ele_num = 1 * 3 * 3 * 1; size_t m_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t m_bytes = m_ele_num * m_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&mapping_channel_, m_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mapping_channel_, m_bytes)); } } @@ -115,7 +115,7 @@ class psroipool_backward : public testing::Test { bottom_grad_desc_ = NULL; } if (bottom_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(bottom_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(bottom_grad_)); bottom_grad_ = NULL; } if (rois_desc_) { @@ -123,7 +123,7 @@ class psroipool_backward : public testing::Test { rois_desc_ = NULL; } if (rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = NULL; } if (top_grad_desc_) { @@ -131,7 +131,7 @@ class psroipool_backward : public testing::Test { top_grad_desc_ = NULL; } if (top_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(top_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(top_grad_)); top_grad_ = NULL; } if (mapping_channel_desc_) { @@ -139,7 +139,7 @@ class psroipool_backward : public testing::Test { mapping_channel_desc_ = NULL; } if (mapping_channel_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mapping_channel_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mapping_channel_)); mapping_channel_ = NULL; } } diff --git 
a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward_general.cpp index c57a07ded..360d1916f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_backward/psroipool_backward_general.cpp @@ -64,7 +64,7 @@ class psroipool_backward_general uint64_t b_ele_num = mluOpGetTensorElementNum(bottom_grad_desc_); uint64_t b_bytes = mluOpDataTypeBytes(b_dtype) * b_ele_num; if (b_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&bottom_grad_, b_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&bottom_grad_, b_bytes)) } MLUOpTensorParam r_params = std::get<2>(GetParam()); @@ -78,7 +78,7 @@ class psroipool_backward_general uint64_t r_ele_num = mluOpGetTensorElementNum(rois_desc_); uint64_t r_bytes = mluOpDataTypeBytes(r_dtype) * r_ele_num; if (r_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, r_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, r_bytes)) } MLUOpTensorParam o_params = std::get<3>(GetParam()); @@ -92,7 +92,7 @@ class psroipool_backward_general uint64_t o_ele_num = mluOpGetTensorElementNum(top_grad_desc_); uint64_t o_bytes = mluOpDataTypeBytes(o_dtype) * o_ele_num; if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&top_grad_, o_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&top_grad_, o_bytes)) } MLUOpTensorParam m_params = std::get<4>(GetParam()); @@ -106,7 +106,7 @@ class psroipool_backward_general uint64_t m_ele_num = mluOpGetTensorElementNum(mapping_channel_desc_); uint64_t m_bytes = mluOpDataTypeBytes(m_dtype) * m_ele_num; if (m_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&mapping_channel_, m_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mapping_channel_, m_bytes)) } } @@ -136,7 +136,7 @@ class psroipool_backward_general bottom_grad_desc_ = NULL; } if (bottom_grad_) { - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(bottom_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(bottom_grad_)); bottom_grad_ = NULL; } if (rois_desc_) { @@ -144,7 +144,7 @@ class psroipool_backward_general rois_desc_ = NULL; } if (rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = NULL; } if (top_grad_desc_) { @@ -152,7 +152,7 @@ class psroipool_backward_general top_grad_desc_ = NULL; } if (top_grad_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(top_grad_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(top_grad_)); top_grad_ = NULL; } if (mapping_channel_desc_) { @@ -160,7 +160,7 @@ class psroipool_backward_general mapping_channel_desc_ = NULL; } if (mapping_channel_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mapping_channel_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mapping_channel_)); mapping_channel_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward.cpp index c42a463d5..ae8bda9bb 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward.cpp @@ -51,7 +51,7 @@ class psroipool_forward : public testing::Test { size_t i_ele_num = 1 * 5 * 5 * 9; size_t i_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t i_bytes = i_ele_num * i_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)); } if (rois_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&rois_desc_)); @@ -63,7 +63,7 @@ class psroipool_forward : public testing::Test { size_t r_ele_num = 1 * 5; size_t r_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t r_bytes = r_ele_num * r_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, r_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, r_bytes)); } if 
(output_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -76,7 +76,7 @@ class psroipool_forward : public testing::Test { size_t o_ele_num = 1 * 3 * 3 * 1; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } if (mapping_channel_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&mapping_channel_desc_)); @@ -89,7 +89,7 @@ class psroipool_forward : public testing::Test { size_t m_ele_num = 1 * 3 * 3 * 1; size_t m_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t m_bytes = m_ele_num * m_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&mapping_channel_, m_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mapping_channel_, m_bytes)); } } @@ -114,7 +114,7 @@ class psroipool_forward : public testing::Test { input_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (rois_desc_) { @@ -122,7 +122,7 @@ class psroipool_forward : public testing::Test { rois_desc_ = NULL; } if (rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = NULL; } if (output_desc_) { @@ -130,7 +130,7 @@ class psroipool_forward : public testing::Test { output_desc_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (mapping_channel_desc_) { @@ -138,7 +138,7 @@ class psroipool_forward : public testing::Test { mapping_channel_desc_ = NULL; } if (mapping_channel_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mapping_channel_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mapping_channel_)); mapping_channel_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward_general.cpp 
b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward_general.cpp index 2186bd59a..8f7bb2aef 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/psroipool_forward/psroipool_forward_general.cpp @@ -64,7 +64,7 @@ class psroipool_forward_general uint64_t i_ele_num = mluOpGetTensorElementNum(input_desc_); uint64_t i_bytes = mluOpDataTypeBytes(i_dtype) * i_ele_num; if (i_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)) } MLUOpTensorParam r_params = std::get<2>(GetParam()); @@ -78,7 +78,7 @@ class psroipool_forward_general uint64_t r_ele_num = mluOpGetTensorElementNum(rois_desc_); uint64_t r_bytes = mluOpDataTypeBytes(r_dtype) * r_ele_num; if (r_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&rois_, r_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&rois_, r_bytes)) } MLUOpTensorParam o_params = std::get<3>(GetParam()); @@ -92,7 +92,7 @@ class psroipool_forward_general uint64_t o_ele_num = mluOpGetTensorElementNum(output_desc_); uint64_t o_bytes = mluOpDataTypeBytes(o_dtype) * o_ele_num; if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)) } MLUOpTensorParam m_params = std::get<4>(GetParam()); @@ -106,7 +106,7 @@ class psroipool_forward_general uint64_t m_ele_num = mluOpGetTensorElementNum(mapping_channel_desc_); uint64_t m_bytes = mluOpDataTypeBytes(m_dtype) * m_ele_num; if (m_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&mapping_channel_, m_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&mapping_channel_, m_bytes)) } } @@ -136,7 +136,7 @@ class psroipool_forward_general input_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (rois_desc_) { @@ -144,7 +144,7 @@ 
class psroipool_forward_general rois_desc_ = NULL; } if (rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = NULL; } if (output_desc_) { @@ -152,7 +152,7 @@ class psroipool_forward_general output_desc_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } if (mapping_channel_desc_) { @@ -160,7 +160,7 @@ class psroipool_forward_general mapping_channel_desc_ = NULL; } if (mapping_channel_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(mapping_channel_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(mapping_channel_)); mapping_channel_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward.cpp index 00e036f9c..fcbbd7b43 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward.cpp @@ -65,19 +65,19 @@ class roi_crop_backward : public testing::Test { size_t o_ele_num = 1 * 7 * 7 * 9; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, o_bytes)); } if (grid) { size_t g_ele_num = 1 * 7 * 7 * 2; size_t g_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t g_bytes = g_ele_num * g_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grid_, g_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grid_, g_bytes)); } if (grad_input) { size_t i_ele_num = 1 * 7 * 7 * 9; size_t i_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t i_bytes = i_ele_num * i_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, i_bytes)); } } @@ 
-110,15 +110,15 @@ class roi_crop_backward : public testing::Test { grad_input_desc_ = NULL; } if (grad_input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = NULL; } if (grid_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grid_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grid_)); grid_ = NULL; } if (grad_output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward_general.cpp index 0811a0e78..fd985e91f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_backward/roi_crop_backward_general.cpp @@ -58,7 +58,7 @@ class roi_crop_backward_general uint64_t o_ele_num = mluOpGetTensorElementNum(grad_output_desc_); uint64_t o_bytes = mluOpDataTypeBytes(o_dtype) * o_ele_num; if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, o_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&grid_desc_)); @@ -72,7 +72,7 @@ class roi_crop_backward_general uint64_t g_ele_num = mluOpGetTensorElementNum(grid_desc_); uint64_t g_bytes = mluOpDataTypeBytes(g_dtype) * g_ele_num; if (g_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grid_, g_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grid_, g_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&grad_input_desc_)); @@ -86,7 +86,7 @@ class roi_crop_backward_general uint64_t i_ele_num = mluOpGetTensorElementNum(grad_input_desc_); uint64_t i_bytes = mluOpDataTypeBytes(i_dtype) * i_ele_num; if (i_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_input_, i_bytes)) + GTEST_CHECK(cnrtSuccess == 
cnrtMalloc(&grad_input_, i_bytes)) } } @@ -124,15 +124,15 @@ class roi_crop_backward_general grad_input_desc_ = NULL; } if (grad_output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = NULL; } if (grid_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grid_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grid_)); grid_ = NULL; } if (grad_input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward.cpp index 175e6400c..a0bc13073 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward.cpp @@ -63,19 +63,19 @@ class roi_crop_forward : public testing::Test { size_t i_ele_num = 1 * 7 * 7 * 9; size_t i_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t i_bytes = i_ele_num * i_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)); } if (grid) { size_t g_ele_num = 1 * 7 * 7 * 2; size_t g_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t g_bytes = g_ele_num * g_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grid_, g_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grid_, g_bytes)); } if (output) { size_t o_ele_num = 1 * 7 * 7 * 9; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } } @@ -107,15 +107,15 @@ class roi_crop_forward : public testing::Test { output_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (grid_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grid_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grid_)); grid_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward_general.cpp index 105c651de..f8cb9c425 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roi_crop_forward/roi_crop_forward_general.cpp @@ -60,7 +60,7 @@ class roi_crop_forward_general uint64_t i_ele_num = mluOpGetTensorElementNum(input_desc_); uint64_t i_bytes = mluOpDataTypeBytes(i_dtype) * i_ele_num; if (i_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, i_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&grid_desc_)); @@ -74,7 +74,7 @@ class roi_crop_forward_general uint64_t g_ele_num = mluOpGetTensorElementNum(grid_desc_); uint64_t g_bytes = mluOpDataTypeBytes(g_dtype) * g_ele_num; if (g_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grid_, g_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grid_, g_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -88,7 +88,7 @@ class roi_crop_forward_general uint64_t o_ele_num = mluOpGetTensorElementNum(output_desc_); uint64_t o_bytes = mluOpDataTypeBytes(o_dtype) * o_ele_num; if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } } @@ -125,15 +125,15 @@ class roi_crop_forward_general output_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if 
(grid_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grid_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grid_)); grid_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward.cpp index 4a16552d8..d35f8107a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward.cpp @@ -54,12 +54,12 @@ class roiaware_pool3d_forward : public testing::Test { if (rois) { if (rois_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&rois_, mluOpGetTensorElementNum(rois_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&rois_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -75,12 +75,12 @@ class roiaware_pool3d_forward : public testing::Test { if (pts) { if (pts_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pts_, mluOpGetTensorElementNum(pts_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pts_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -95,12 +95,12 @@ class roiaware_pool3d_forward : public testing::Test { if (pts_feature) { if (pts_feature_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pts_feature_, mluOpGetTensorElementNum(pts_feature_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pts_feature_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -116,12 +116,12 @@ class roiaware_pool3d_forward : public testing::Test { if (pooled_features) { if 
(pooled_features_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pooled_features_, mluOpGetTensorElementNum(pooled_features_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pooled_features_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -138,12 +138,12 @@ class roiaware_pool3d_forward : public testing::Test { if (argmax) { if (argmax_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&argmax_, mluOpGetTensorElementNum(argmax_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&argmax_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } @@ -159,19 +159,19 @@ class roiaware_pool3d_forward : public testing::Test { if (pts_idx_of_voxels) { if (pts_idx_of_voxels_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pts_idx_of_voxels_, mluOpGetTensorElementNum(pts_idx_of_voxels_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pts_idx_of_voxels_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } if (worksapce) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } } @@ -203,7 +203,7 @@ class roiaware_pool3d_forward : public testing::Test { if (rois_) { VLOG(4) << "Destroy rois"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = nullptr; } @@ -215,7 +215,7 @@ class roiaware_pool3d_forward : public testing::Test { if (pts_) { VLOG(4) << "Destroy pts"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pts_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pts_)); pts_ = nullptr; } @@ -227,13 +227,13 @@ class roiaware_pool3d_forward : public testing::Test { if (pts_feature_) { VLOG(4) << "Destroy pts_feature"; - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pts_feature_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pts_feature_)); pts_feature_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } @@ -245,7 +245,7 @@ class roiaware_pool3d_forward : public testing::Test { if (argmax_) { VLOG(4) << "Destroy argmax"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(argmax_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(argmax_)); argmax_ = nullptr; } @@ -257,7 +257,7 @@ class roiaware_pool3d_forward : public testing::Test { if (pts_idx_of_voxels_) { VLOG(4) << "Destroy pts_idx_of_voxels"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pts_idx_of_voxels_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pts_idx_of_voxels_)); pts_idx_of_voxels_ = nullptr; } @@ -269,7 +269,7 @@ class roiaware_pool3d_forward : public testing::Test { if (pooled_features_) { VLOG(4) << "Destroy pooled_features"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pooled_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pooled_features_)); pooled_features_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward_general.cpp index cc2c8f062..2243be2b0 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roiaware_pool3d_forward/roiaware_pool3d_forward_general.cpp @@ -53,7 +53,7 @@ class roiaware_pool3d_forward_general rois_desc_, rois_params.get_layout(), rois_params.get_dtype(), rois_params.get_dim_nb(), rois_params.get_dim_size().data())); GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&rois_, mluOpDataTypeBytes(rois_params.get_dtype()) * 10)); MLUOpTensorParam pts_params = std::get<1>(GetParam()); @@ -62,7 +62,7 @@ class 
roiaware_pool3d_forward_general pts_desc_, pts_params.get_layout(), pts_params.get_dtype(), pts_params.get_dim_nb(), pts_params.get_dim_size().data())); GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pts_, mluOpDataTypeBytes(pts_params.get_dtype()) * 10)); MLUOpTensorParam pts_feature_params = std::get<2>(GetParam()); @@ -72,7 +72,7 @@ class roiaware_pool3d_forward_general pts_feature_params.get_dtype(), pts_feature_params.get_dim_nb(), pts_feature_params.get_dim_size().data())); GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pts_feature_, mluOpDataTypeBytes(pts_feature_params.get_dtype()) * 10)); @@ -84,7 +84,7 @@ class roiaware_pool3d_forward_general pooled_features_params.get_dim_nb(), pooled_features_params.get_dim_size().data())); GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &pooled_features_, mluOpDataTypeBytes(pooled_features_params.get_dtype()) * 10)); @@ -95,7 +95,7 @@ class roiaware_pool3d_forward_general argmax_desc_, argmax_params.get_layout(), argmax_params.get_dtype(), argmax_params.get_dim_nb(), argmax_params.get_dim_size().data())); GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&argmax_, mluOpDataTypeBytes(argmax_params.get_dtype()) * 10)); @@ -107,7 +107,7 @@ class roiaware_pool3d_forward_general pts_idx_of_voxels_params.get_dim_nb(), pts_idx_of_voxels_params.get_dim_size().data())); GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &pts_idx_of_voxels_, mluOpDataTypeBytes(pts_idx_of_voxels_params.get_dtype()) * 10)); @@ -137,7 +137,7 @@ class roiaware_pool3d_forward_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpRoiAwarePool3dForward( handle_, pool_method_, boxes_num_, pts_num_, channels_, rois_desc_, rois_, pts_desc_, pts_, pts_feature_desc_, pts_feature_, workspace_, @@ -162,7 +162,7 @@ class 
roiaware_pool3d_forward_general } if (rois_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(rois_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(rois_)); rois_ = nullptr; } @@ -172,7 +172,7 @@ class roiaware_pool3d_forward_general } if (pts_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pts_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pts_)); pts_ = nullptr; } @@ -182,12 +182,12 @@ class roiaware_pool3d_forward_general } if (pts_feature_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pts_feature_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pts_feature_)); pts_feature_ = nullptr; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } @@ -197,7 +197,7 @@ class roiaware_pool3d_forward_general } if (argmax_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(argmax_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(argmax_)); argmax_ = nullptr; } @@ -207,7 +207,7 @@ class roiaware_pool3d_forward_general } if (pts_idx_of_voxels_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pts_idx_of_voxels_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pts_idx_of_voxels_)); pts_idx_of_voxels_ = nullptr; } @@ -217,7 +217,7 @@ class roiaware_pool3d_forward_general } if (pooled_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pooled_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pooled_features_)); pooled_features_ = nullptr; } } catch (const std::exception &e) { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d.cpp index 820196cca..9642e8ff5 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d.cpp @@ -54,12 +54,12 @@ class roipoint_pool3d : public testing::Test { if (points) { if (points_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&points_, 
mluOpGetTensorElementNum(points_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&points_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -74,12 +74,12 @@ class roipoint_pool3d : public testing::Test { if (point_features) { if (point_features_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&point_features_, mluOpGetTensorElementNum(point_features_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&point_features_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } @@ -96,12 +96,12 @@ class roipoint_pool3d : public testing::Test { if (boxes3d) { if (boxes3d_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&boxes3d_, mluOpGetTensorElementNum(boxes3d_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&boxes3d_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } } @@ -116,12 +116,12 @@ class roipoint_pool3d : public testing::Test { if (pooled_features) { if (pooled_features_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pooled_features_, mluOpGetTensorElementNum(pooled_features_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pooled_features_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT))); } @@ -138,19 +138,19 @@ class roipoint_pool3d : public testing::Test { if (pooled_empty_flag) { if (pooled_empty_flag_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pooled_empty_flag_, mluOpGetTensorElementNum(pooled_empty_flag_desc_) * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pooled_empty_flag_, 64 * mluOpDataTypeBytes(MLUOP_DTYPE_INT32))); } } if (workspace) { - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); } } mluOpStatus_t compute() { @@ -181,7 +181,7 @@ class roipoint_pool3d : public testing::Test { if (points_) { VLOG(4) << "Destroy points_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(points_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(points_)); points_ = nullptr; } @@ -193,7 +193,7 @@ class roipoint_pool3d : public testing::Test { if (point_features_) { VLOG(4) << "Destroy point_features_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(point_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(point_features_)); point_features_ = nullptr; } @@ -205,7 +205,7 @@ class roipoint_pool3d : public testing::Test { if (boxes3d_) { VLOG(4) << "Destroy boxes3d_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes3d_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes3d_)); boxes3d_ = nullptr; } @@ -217,7 +217,7 @@ class roipoint_pool3d : public testing::Test { if (pooled_features_) { VLOG(4) << "Destroy pooled_features_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pooled_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pooled_features_)); pooled_features_ = nullptr; } @@ -229,13 +229,13 @@ class roipoint_pool3d : public testing::Test { if (pooled_empty_flag_) { VLOG(4) << "Destroy pooled_empty_flag_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pooled_empty_flag_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pooled_empty_flag_)); pooled_empty_flag_ = nullptr; } if (workspace_) { VLOG(4) << "Destroy workspace"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d_general.cpp index 49c3dd5ee..03efa9d85 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d_general.cpp +++ 
b/test/mlu_op_gtest/api_gtest/src/gtest/roipoint_pool3d/roipoint_pool3d_general.cpp @@ -65,12 +65,12 @@ class roipoint_pool3d_general points_params.get_dim_nb(), points_params.get_dim_size().data())); if (mluOpGetTensorElementNum(points_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&points_, mluOpDataTypeBytes(points_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&points_, mluOpDataTypeBytes(points_params.get_dtype()) * mluOpGetTensorElementNum(points_desc_))); } @@ -83,13 +83,13 @@ class roipoint_pool3d_general point_features_params.get_dim_size().data())); if (mluOpGetTensorElementNum(point_features_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &point_features_, mluOpDataTypeBytes(point_features_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&point_features_, mluOpDataTypeBytes(point_features_params.get_dtype()) * mluOpGetTensorElementNum(point_features_desc_))); @@ -104,11 +104,11 @@ class roipoint_pool3d_general if (mluOpGetTensorElementNum(boxes3d_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&boxes3d_, mluOpDataTypeBytes(boxes3d_params.get_dtype()) * 10)); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes3d_, mluOpDataTypeBytes(boxes3d_params.get_dtype()) * mluOpGetTensorElementNum(boxes3d_desc_))); @@ -123,13 +123,13 @@ class roipoint_pool3d_general pooled_features_params.get_dim_size().data())); if (mluOpGetTensorElementNum(pooled_features_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &pooled_features_, mluOpDataTypeBytes(pooled_features_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&pooled_features_, mluOpDataTypeBytes(pooled_features_params.get_dtype()) * 
mluOpGetTensorElementNum(pooled_features_desc_))); @@ -145,13 +145,13 @@ class roipoint_pool3d_general if (mluOpGetTensorElementNum(pooled_empty_flag_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &pooled_empty_flag_, mluOpDataTypeBytes(pooled_empty_flag_params.get_dtype()) * 10)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc( &pooled_empty_flag_, mluOpDataTypeBytes(pooled_empty_flag_params.get_dtype()) * @@ -178,7 +178,7 @@ class roipoint_pool3d_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpRoiPointPool3d( handle_, batch_size_, pts_num_, boxes_num_, feature_in_len_, @@ -204,7 +204,7 @@ class roipoint_pool3d_general if (points_) { VLOG(4) << "Destroy points_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(points_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(points_)); points_ = nullptr; } @@ -216,7 +216,7 @@ class roipoint_pool3d_general if (point_features_) { VLOG(4) << "Destroy point_features_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(point_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(point_features_)); point_features_ = nullptr; } @@ -228,7 +228,7 @@ class roipoint_pool3d_general if (boxes3d_) { VLOG(4) << "Destroy boxes3d_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes3d_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes3d_)); boxes3d_ = nullptr; } @@ -240,7 +240,7 @@ class roipoint_pool3d_general if (pooled_features_) { VLOG(4) << "Destroy pooled_features_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pooled_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pooled_features_)); pooled_features_ = nullptr; } @@ -252,7 +252,7 @@ class roipoint_pool3d_general if (pooled_empty_flag_) { VLOG(4) << "Destroy pooled_empty_flag_"; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pooled_empty_flag_)); + GTEST_CHECK(cnrtSuccess == 
cnrtFree(pooled_empty_flag_)); pooled_empty_flag_ = nullptr; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/test_env.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/test_env.cpp index 4e030d214..b0f29aeee 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/test_env.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/test_env.cpp @@ -28,7 +28,7 @@ void TestEnvironment::SetUp() { // 2. get device num unsigned int dev_num = 0; - ASSERT_EQ(cnrtGetDeviceCount(&dev_num), CNRT_RET_SUCCESS); + ASSERT_EQ(cnrtGetDeviceCount(&dev_num), cnrtSuccess); if (dev_num <= 0) { // dev_num_ should > 0 FAIL() << "Can't find device"; } else { @@ -43,8 +43,8 @@ void TestEnvironment::SetUp() { // cnrt set current device using CNRT_DEFAULT_DEVICE // in cnrtGetDevice() CNRT_DEFAULT_DEVICE > id VLOG(4) << "Set current device as device: " << dev_id; - ASSERT_EQ(cnrtGetDevice(&dev_id), CNRT_RET_SUCCESS); - ASSERT_EQ(cnrtSetDevice(dev_id), CNRT_RET_SUCCESS); + ASSERT_EQ(cnrtGetDevice(&dev_id), cnrtSuccess); + ASSERT_EQ(cnrtSetDevice(dev_id), cnrtSuccess); } void TestEnvironment::TearDown() { diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward.cpp index 04c228b8a..eac7ea29c 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward.cpp @@ -71,27 +71,27 @@ class three_interprolate_backward : public testing::Test { size_t grad_output_ele_num = 1 * 2 * 4; size_t grad_output_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t grad_output_bytes = grad_output_ele_num * grad_output_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, grad_output_bytes)); } if (indices) { size_t indices_ele_num = 1 * 4 * 3; size_t indices_dtype_bytes = 
mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t indices_bytes = indices_ele_num * indices_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&indices_, indices_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, indices_bytes)); } if (weight) { size_t weight_ele_num = 1 * 4 * 3; size_t weight_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t weight_bytes = weight_ele_num * weight_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&weight_, weight_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&weight_, weight_bytes)); } if (grad_features) { size_t grad_features_ele_num = 1 * 2 * 4; size_t grad_features_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t grad_features_bytes = grad_features_ele_num * grad_features_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_features_, grad_features_bytes)); } } @@ -129,19 +129,19 @@ class three_interprolate_backward : public testing::Test { grad_features_desc_ = NULL; } if (grad_output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (weight_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = NULL; } if (grad_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_features_)); grad_features_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward_general.cpp index 158637b57..721b68c3f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward_general.cpp +++ 
b/test/mlu_op_gtest/api_gtest/src/gtest/three_interpolate_backward/three_interpolate_backward_general.cpp @@ -60,7 +60,7 @@ class three_interprolate_backward_general grad_output_shape.data())); uint64_t grad_output_ele_num = mluOpGetTensorElementNum(grad_output_desc_); if (grad_output_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_output_, 8)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, 8)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&indices_desc_)); @@ -74,7 +74,7 @@ class three_interprolate_backward_general indices_shape.data())); uint64_t indices_ele_num = mluOpGetTensorElementNum(indices_desc_); if (indices_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&indices_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&weight_desc_)); @@ -88,7 +88,7 @@ class three_interprolate_backward_general weight_shape.data())); uint64_t weight_ele_num = mluOpGetTensorElementNum(weight_desc_); if (weight_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&weight_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&weight_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&grad_features_desc_)); @@ -104,7 +104,7 @@ class three_interprolate_backward_general uint64_t grad_features_ele_num = mluOpGetTensorElementNum(grad_features_desc_); if (grad_features_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&grad_features_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_features_, 8)); } } @@ -146,19 +146,19 @@ class three_interprolate_backward_general grad_features_desc_ = NULL; } if (grad_output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (weight_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == 
cnrtFree(weight_)); weight_ = NULL; } if (grad_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_features_)); grad_features_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward.cpp index b297fd25f..b6f85781a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward.cpp @@ -71,25 +71,25 @@ class three_interprolate_forward : public testing::Test { size_t f_ele_num = 1 * 2 * 5; size_t f_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t f_bytes = f_ele_num * f_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&features_, f_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&features_, f_bytes)); } if (indices) { size_t i_ele_num = 1 * 4 * 3; size_t i_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t i_bytes = i_ele_num * i_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&indices_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, i_bytes)); } if (weight) { size_t w_ele_num = 1 * 4 * 3; size_t w_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t w_bytes = w_ele_num * w_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&weight_, w_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&weight_, w_bytes)); } if (output) { size_t o_ele_num = 1 * 2 * 4; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, o_bytes)); } } @@ -126,19 +126,19 @@ class three_interprolate_forward : public testing::Test { output_desc_ = NULL; } if (features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == 
cnrtFree(features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_)); features_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (weight_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward_general.cpp index d35d8fe71..abdcbdb7d 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/three_interprolate_forward/three_interprolate_forward_general.cpp @@ -59,7 +59,7 @@ class three_interprolate_forward_general f_dim, f_shape.data())); uint64_t f_ele_num = mluOpGetTensorElementNum(features_desc_); if (f_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&features_, 8)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&features_, 8)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&indices_desc_)); @@ -72,7 +72,7 @@ class three_interprolate_forward_general i_dim, i_shape.data())); uint64_t i_ele_num = mluOpGetTensorElementNum(indices_desc_); if (i_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&indices_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&indices_, 8)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&weight_desc_)); @@ -85,7 +85,7 @@ class three_interprolate_forward_general w_shape.data())); uint64_t w_ele_num = mluOpGetTensorElementNum(weight_desc_); if (w_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&weight_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&weight_, 8)); } 
MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_desc_)); @@ -98,7 +98,7 @@ class three_interprolate_forward_general o_shape.data())); uint64_t o_ele_num = mluOpGetTensorElementNum(output_desc_); if (o_ele_num > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_, 8)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, 8)); } } @@ -140,19 +140,19 @@ class three_interprolate_forward_general output_desc_ = NULL; } if (features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(features_)); features_ = NULL; } if (indices_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(indices_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(indices_)); indices_ = NULL; } if (weight_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(weight_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(weight_)); weight_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward.cpp index da72f300f..8583867f9 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward.cpp @@ -47,7 +47,7 @@ class three_nn_forward : public testing::Test { unknown_dims.data())); } if (unknown) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&unknown_, 6 * 4)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&unknown_, 6 * 4)) } if (known_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&known_desc_)); @@ -57,10 +57,10 @@ class three_nn_forward : public testing::Test { known_dims.data())); } if (known) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&known_, 30 * 4)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&known_, 30 * 4)) } if (workspace) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)) + GTEST_CHECK(cnrtSuccess == 
cnrtMalloc(&workspace_, workspace_size_)) } if (dist2_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&dist2_desc_)); @@ -70,7 +70,7 @@ class three_nn_forward : public testing::Test { dist2_dims.data())); } if (dist2) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&dist2_, 6 * 4)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&dist2_, 6 * 4)) } if (idx_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&idx_desc_)); @@ -80,7 +80,7 @@ class three_nn_forward : public testing::Test { idx_dims.data())); } if (idx) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&idx_, 6 * 4)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&idx_, 6 * 4)) } } @@ -104,7 +104,7 @@ class three_nn_forward : public testing::Test { unknown_desc_ = NULL; } if (unknown_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(unknown_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(unknown_)); unknown_ = NULL; } if (known_desc_) { @@ -112,11 +112,11 @@ class three_nn_forward : public testing::Test { known_desc_ = NULL; } if (known_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(known_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(known_)); known_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (dist2_desc_) { @@ -124,7 +124,7 @@ class three_nn_forward : public testing::Test { dist2_desc_ = NULL; } if (dist2_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dist2_)) + GTEST_CHECK(cnrtSuccess == cnrtFree(dist2_)) dist2_ = NULL; } if (idx_desc_) { @@ -132,7 +132,7 @@ class three_nn_forward : public testing::Test { idx_desc_ = NULL; } if (idx_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(idx_)); idx_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward_general.cpp index 2f21434ac..e95ad73b4 100644 --- 
a/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/three_nn_forward/three_nn_forward_general.cpp @@ -57,7 +57,7 @@ class three_nn_forward_general size_t u_dtype_size; MLUOP_CHECK(mluOpGetSizeOfDataType(u_dtype, &u_dtype_size)); uint64_t u_bytes = u_dtype_size * mluOpGetTensorElementNum(unknown_desc_); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&unknown_, u_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&unknown_, u_bytes)) } MLUOpTensorParam k_params = std::get<1>(GetParam()); @@ -70,7 +70,7 @@ class three_nn_forward_general size_t k_dtype_size; MLUOP_CHECK(mluOpGetSizeOfDataType(k_dtype, &k_dtype_size)); uint64_t k_bytes = k_dtype_size * mluOpGetTensorElementNum(known_desc_); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&known_, k_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&known_, k_bytes)) } MLUOpTensorParam d_params = std::get<2>(GetParam()); @@ -83,7 +83,7 @@ class three_nn_forward_general size_t d_dtype_size; MLUOP_CHECK(mluOpGetSizeOfDataType(d_dtype, &d_dtype_size)); uint64_t d_bytes = d_dtype_size * mluOpGetTensorElementNum(dist2_desc_); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&dist2_, d_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&dist2_, d_bytes)) } MLUOpTensorParam id_params = std::get<3>(GetParam()); @@ -96,7 +96,7 @@ class three_nn_forward_general size_t id_dtype_size; MLUOP_CHECK(mluOpGetSizeOfDataType(id_dtype, &id_dtype_size)); uint64_t id_bytes = id_dtype_size * mluOpGetTensorElementNum(dist2_desc_); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&idx_, id_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&idx_, id_bytes)) } } bool compute() { @@ -111,7 +111,7 @@ class three_nn_forward_general destroy(); return expected_status_ == status; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)) status = mluOpThreeNNForward(handle_, unknown_desc_, unknown_, 
known_desc_, known_, workspace_, workspace_size_, dist2_desc_, dist2_, idx_desc_, idx_); @@ -131,7 +131,7 @@ class three_nn_forward_general unknown_desc_ = NULL; } if (unknown_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(unknown_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(unknown_)); unknown_ = NULL; } if (known_desc_) { @@ -139,11 +139,11 @@ class three_nn_forward_general known_desc_ = NULL; } if (known_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(known_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(known_)); known_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (dist2_desc_) { @@ -151,7 +151,7 @@ class three_nn_forward_general dist2_desc_ = NULL; } if (dist2_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(dist2_)) + GTEST_CHECK(cnrtSuccess == cnrtFree(dist2_)) dist2_ = NULL; } if (idx_desc_) { @@ -159,7 +159,7 @@ class three_nn_forward_general idx_desc_ = NULL; } if (idx_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(idx_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(idx_)); idx_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.cpp index a3682bc95..98042ba5a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.cpp @@ -49,12 +49,12 @@ class tin_shift_backward : public testing::Test { } if (grad_output) { if (grad_output_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum( grad_output_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_output_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -68,11 +68,11 @@ class tin_shift_backward : public testing::Test { if (shifts) { if (shifts_desc) { 
GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&shifts_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(shifts_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&shifts_, MLUOP_DTYPE_INT32 * 2)); } } @@ -85,12 +85,12 @@ class tin_shift_backward : public testing::Test { } if (grad_input) { if (grad_input_desc) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(grad_input_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&grad_input_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -125,15 +125,15 @@ class tin_shift_backward : public testing::Test { grad_input_desc_ = NULL; } if (grad_output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = NULL; } if (shifts_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(shifts_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(shifts_)); shifts_ = NULL; } if (grad_input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.general.cpp index 7cb0118c6..93811cb09 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_backward/tin_shift_backward.general.cpp @@ -52,12 +52,12 @@ class tin_shift_backward_general grad_output_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_output_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_output_, mluOpDataTypeBytes(grad_output_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_output_, 
mluOpDataTypeBytes(grad_output_params.get_dtype()) * mluOpGetTensorElementNum(grad_output_desc_))); @@ -70,12 +70,12 @@ class tin_shift_backward_general shifts_params.get_dim_nb(), shifts_params.get_dim_size().data())); if (mluOpGetTensorElementNum(shifts_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&shifts_, mluOpDataTypeBytes(shifts_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&shifts_, mluOpDataTypeBytes(shifts_params.get_dtype()) * mluOpGetTensorElementNum(shifts_desc_))); } @@ -88,12 +88,12 @@ class tin_shift_backward_general grad_input_params.get_dim_size().data())); if (mluOpGetTensorElementNum(grad_input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&grad_input_, mluOpDataTypeBytes(grad_input_params.get_dtype()) * mluOpGetTensorElementNum(grad_input_desc_))); @@ -140,15 +140,15 @@ class tin_shift_backward_general grad_input_desc_ = NULL; } if (grad_output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_output_)); grad_output_ = NULL; } if (shifts_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(shifts_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(shifts_)); shifts_ = NULL; } if (grad_input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(grad_input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(grad_input_)); grad_input_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward.cpp index 38f4eab32..2d3b09e04 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward.cpp @@ -49,11 +49,11 @@ class 
tin_shift_forward : public testing::Test { if (input) { if (input_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(input_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -67,11 +67,11 @@ class tin_shift_forward : public testing::Test { if (shifts) { if (shifts_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&shifts_, MLUOP_DTYPE_INT32 * mluOpGetTensorElementNum(shifts_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&shifts_, MLUOP_DTYPE_INT32 * 2)); } } @@ -85,11 +85,11 @@ class tin_shift_forward : public testing::Test { if (output) { if (output_desc) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_, MLUOP_DTYPE_FLOAT * mluOpGetTensorElementNum(output_desc_))); } else { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_, MLUOP_DTYPE_FLOAT * 2)); } } @@ -124,15 +124,15 @@ class tin_shift_forward : public testing::Test { output_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (shifts_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(shifts_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(shifts_)); shifts_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward_general.cpp index 545dc0bb5..5dcb0eb2c 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/tin_shift_forward/tin_shift_forward_general.cpp @@ -51,12 +51,12 @@ class 
tin_shift_forward_general input_params.get_dim_nb(), input_params.get_dim_size().data())); if (mluOpGetTensorElementNum(input_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&input_, mluOpDataTypeBytes(input_params.get_dtype()) * mluOpGetTensorElementNum(input_desc_))); } @@ -68,12 +68,12 @@ class tin_shift_forward_general shifts_params.get_dim_nb(), shifts_params.get_dim_size().data())); if (mluOpGetTensorElementNum(shifts_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&shifts_, mluOpDataTypeBytes(shifts_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&shifts_, mluOpDataTypeBytes(shifts_params.get_dtype()) * mluOpGetTensorElementNum(shifts_desc_))); } @@ -85,12 +85,12 @@ class tin_shift_forward_general output_params.get_dim_nb(), output_params.get_dim_size().data())); if (mluOpGetTensorElementNum(output_desc_) >= LARGE_TENSOR_NUM) { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_, mluOpDataTypeBytes(output_params.get_dtype()) * 2)); } else { GTEST_CHECK( - CNRT_RET_SUCCESS == + cnrtSuccess == cnrtMalloc(&output_, mluOpDataTypeBytes(output_params.get_dtype()) * mluOpGetTensorElementNum(output_desc_))); } @@ -136,15 +136,15 @@ class tin_shift_forward_general output_desc_ = NULL; } if (input_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_)); input_ = NULL; } if (shifts_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(shifts_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(shifts_)); shifts_ = NULL; } if (output_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_)); output_ = NULL; } } diff --git 
a/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward.cpp index c9afe610a..c3873c1e6 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward.cpp @@ -72,25 +72,25 @@ class voxel_pooling_forward : public testing::Test { size_t g_ele_num = 2 * 4 * 3; size_t g_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t g_bytes = g_ele_num * g_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&geom_xyz_, g_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&geom_xyz_, g_bytes)); } if (input_features) { size_t i_ele_num = 2 * 4 * 10; size_t i_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t i_bytes = i_ele_num * i_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_features_, i_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_features_, i_bytes)); } if (output_features) { size_t o_ele_num = 2 * 5 * 6 * 10; size_t o_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t o_bytes = o_ele_num * o_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_features_, o_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_features_, o_bytes)); } if (pos_memo) { size_t p_ele_num = 2 * 4 * 3; size_t p_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t p_bytes = p_ele_num * p_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&pos_memo_, p_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pos_memo_, p_bytes)); } } @@ -129,19 +129,19 @@ class voxel_pooling_forward : public testing::Test { pos_memo_desc_ = NULL; } if (geom_xyz_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(geom_xyz_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(geom_xyz_)); geom_xyz_ = NULL; } if (input_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_features_)); + GTEST_CHECK(cnrtSuccess == 
cnrtFree(input_features_)); input_features_ = NULL; } if (output_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_features_)); output_features_ = NULL; } if (pos_memo_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pos_memo_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pos_memo_)); output_features_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward_general.cpp index 36e065e58..8f2e4be28 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/voxel_pooling_forward/voxel_pooling_forward_general.cpp @@ -71,7 +71,7 @@ class voxel_pooling_forward_general g_bytes = mluOpDataTypeBytes(g_dtype) * 36; } if (g_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&geom_xyz_, g_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&geom_xyz_, g_bytes)) } MLUOpTensorParam i_params = std::get<2>(GetParam()); @@ -90,7 +90,7 @@ class voxel_pooling_forward_general i_bytes = mluOpDataTypeBytes(g_dtype) * 80; } if (i_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&input_features_, i_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&input_features_, i_bytes)) } MLUOpTensorParam o_params = std::get<3>(GetParam()); @@ -109,7 +109,7 @@ class voxel_pooling_forward_general o_bytes = mluOpDataTypeBytes(g_dtype) * 600; } if (o_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&output_features_, o_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&output_features_, o_bytes)) } MLUOpTensorParam p_params = std::get<4>(GetParam()); @@ -128,7 +128,7 @@ class voxel_pooling_forward_general p_bytes = mluOpDataTypeBytes(p_dtype) * 600; } if (p_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&pos_memo_, p_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&pos_memo_, 
p_bytes)) } } @@ -171,19 +171,19 @@ class voxel_pooling_forward_general pos_memo_desc_ = NULL; } if (geom_xyz_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(geom_xyz_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(geom_xyz_)); geom_xyz_ = NULL; } if (input_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(input_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(input_features_)); input_features_ = NULL; } if (output_features_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(output_features_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(output_features_)); output_features_ = NULL; } if (pos_memo_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(pos_memo_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(pos_memo_)); output_features_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization.cpp index 4a5d16156..969f44ca7 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization.cpp @@ -55,7 +55,7 @@ class voxelization : public testing::Test { size_t points_ele_num = 1 * 2; size_t points_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t points_bytes = points_ele_num * points_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&points_, points_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&points_, points_bytes)); } if (voxel_size_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&voxel_size_desc_)); @@ -68,7 +68,7 @@ class voxelization : public testing::Test { size_t voxel_size_ele_num = 3; size_t voxel_size_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t voxel_size_bytes = voxel_size_ele_num * voxel_size_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_size_, voxel_size_bytes)); } if (coors_range_desc) { @@ -82,14 +82,14 @@ class voxelization : public testing::Test { size_t coors_range_ele_num = 6; size_t 
coors_range_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t coors_range_bytes = coors_range_ele_num * coors_range_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&coors_range_, coors_range_bytes)); } if (workspace) { size_t workspace_ele_num = workspace_size_; size_t workspace_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t workspace_bytes = workspace_ele_num * workspace_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_bytes)); } if (voxels_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&voxels_desc_)); @@ -102,7 +102,7 @@ class voxelization : public testing::Test { size_t voxels_ele_num = 5 * 4 * 2; size_t voxels_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t voxels_bytes = voxels_ele_num * voxels_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&voxels_, voxels_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxels_, voxels_bytes)); } if (coors_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&coors_desc_)); @@ -115,7 +115,7 @@ class voxelization : public testing::Test { size_t coors_ele_num = 5 * 3; size_t coors_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t coors_bytes = coors_ele_num * coors_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&coors_, coors_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&coors_, coors_bytes)); } if (num_points_per_voxel_desc) { MLUOP_CHECK(mluOpCreateTensorDescriptor(&num_points_per_voxel_desc_)); @@ -130,7 +130,7 @@ class voxelization : public testing::Test { mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t num_points_per_voxel_bytes = num_points_per_voxel_ele_num * num_points_per_voxel_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&num_points_per_voxel_, + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&num_points_per_voxel_, num_points_per_voxel_bytes)); } if (voxel_num_desc) { @@ -144,7 +144,7 @@ class voxelization 
: public testing::Test { size_t voxel_num_ele_num = 1; size_t voxel_num_dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t voxel_num_bytes = voxel_num_ele_num * voxel_num_dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&voxel_num_, voxel_num_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_num_, voxel_num_bytes)); } } @@ -171,7 +171,7 @@ class voxelization : public testing::Test { points_desc_ = NULL; } if (points_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(points_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(points_)); points_ = NULL; } if (voxel_size_desc_) { @@ -179,7 +179,7 @@ class voxelization : public testing::Test { voxel_size_desc_ = NULL; } if (voxel_size_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_size_)); voxel_size_ = NULL; } if (coors_range_desc_) { @@ -187,11 +187,11 @@ class voxelization : public testing::Test { coors_range_desc_ = NULL; } if (coors_range_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(coors_range_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(coors_range_)); coors_range_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (voxels_desc_) { @@ -199,7 +199,7 @@ class voxelization : public testing::Test { voxels_desc_ = NULL; } if (voxels_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxels_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxels_)); voxels_ = NULL; } if (coors_desc_) { @@ -207,7 +207,7 @@ class voxelization : public testing::Test { coors_desc_ = NULL; } if (coors_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(coors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(coors_)); coors_ = NULL; } if (num_points_per_voxel_desc_) { @@ -215,7 +215,7 @@ class voxelization : public testing::Test { num_points_per_voxel_desc_ = NULL; } if (num_points_per_voxel_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(num_points_per_voxel_)); + GTEST_CHECK(cnrtSuccess == 
cnrtFree(num_points_per_voxel_)); num_points_per_voxel_ = NULL; } if (voxel_num_desc_) { @@ -223,7 +223,7 @@ class voxelization : public testing::Test { voxel_num_desc_ = NULL; } if (voxel_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_num_)); voxel_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization_general.cpp index 77e4661e5..0fc24ad8a 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/voxelization/voxelization_general.cpp @@ -59,7 +59,7 @@ class voxelization_general : public testing::TestWithParam { uint64_t points_ele_num = mluOpGetTensorElementNum(points_desc_); uint64_t points_bytes = mluOpDataTypeBytes(points_dtype) * points_ele_num; if (points_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&points_, points_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&points_, points_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&voxel_size_desc_)); @@ -75,7 +75,7 @@ class voxelization_general : public testing::TestWithParam { uint64_t voxel_size_bytes = mluOpDataTypeBytes(voxel_size_dtype) * voxel_size_ele_num; if (voxel_size_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_size_, voxel_size_bytes)) } @@ -92,7 +92,7 @@ class voxelization_general : public testing::TestWithParam { uint64_t coors_range_bytes = mluOpDataTypeBytes(coors_range_dtype) * coors_range_ele_num; if (coors_range_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&coors_range_, coors_range_bytes)) } @@ -112,7 +112,7 @@ class voxelization_general : public testing::TestWithParam { uint64_t voxels_ele_num = mluOpGetTensorElementNum(voxels_desc_); uint64_t voxels_bytes = mluOpDataTypeBytes(voxels_dtype) * voxels_ele_num; if (voxels_bytes > 0) { - 
GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&voxels_, voxels_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxels_, voxels_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&coors_desc_)); @@ -126,7 +126,7 @@ class voxelization_general : public testing::TestWithParam { uint64_t coors_ele_num = mluOpGetTensorElementNum(coors_desc_); uint64_t coors_bytes = mluOpDataTypeBytes(coors_dtype) * coors_ele_num; if (coors_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&coors_, coors_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&coors_, coors_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&num_points_per_voxel_desc_)); @@ -148,7 +148,7 @@ class voxelization_general : public testing::TestWithParam { mluOpDataTypeBytes(num_points_per_voxel_dtype) * num_points_per_voxel_ele_num; if (num_points_per_voxel_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&num_points_per_voxel_, + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&num_points_per_voxel_, num_points_per_voxel_bytes)) } @@ -165,7 +165,7 @@ class voxelization_general : public testing::TestWithParam { uint64_t voxel_num_bytes = mluOpDataTypeBytes(voxel_num_dtype) * voxel_num_ele_num; if (voxel_num_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&voxel_num_, voxel_num_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&voxel_num_, voxel_num_bytes)) } PublicParam publicParam = std::get<8>(GetParam()); @@ -186,7 +186,7 @@ class voxelization_general : public testing::TestWithParam { destroy(); return status == expected_status_; } - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&workspace_, workspace_size_)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&workspace_, workspace_size_)); status = mluOpVoxelization( handle_, points_desc_, points_, voxel_size_desc_, voxel_size_, coors_range_desc_, coors_range_, max_points_, max_voxels_, NDim_, @@ -209,7 +209,7 @@ class voxelization_general : public testing::TestWithParam { points_desc_ = NULL; } if (points_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(points_)); + 
GTEST_CHECK(cnrtSuccess == cnrtFree(points_)); points_ = NULL; } if (voxel_size_desc_) { @@ -217,7 +217,7 @@ class voxelization_general : public testing::TestWithParam { voxel_size_desc_ = NULL; } if (voxel_size_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_size_)); voxel_size_ = NULL; } if (coors_range_desc_) { @@ -225,11 +225,11 @@ class voxelization_general : public testing::TestWithParam { coors_range_desc_ = NULL; } if (coors_range_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(coors_range_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(coors_range_)); coors_range_ = NULL; } if (workspace_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(workspace_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(workspace_)); workspace_ = NULL; } if (voxels_desc_) { @@ -237,7 +237,7 @@ class voxelization_general : public testing::TestWithParam { voxels_desc_ = NULL; } if (voxels_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxels_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxels_)); voxels_ = NULL; } if (coors_desc_) { @@ -245,7 +245,7 @@ class voxelization_general : public testing::TestWithParam { coors_desc_ = NULL; } if (coors_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(coors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(coors_)); coors_ = NULL; } if (num_points_per_voxel_desc_) { @@ -253,7 +253,7 @@ class voxelization_general : public testing::TestWithParam { num_points_per_voxel_desc_ = NULL; } if (num_points_per_voxel_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(num_points_per_voxel_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(num_points_per_voxel_)); num_points_per_voxel_ = NULL; } if (voxel_num_desc_) { @@ -261,7 +261,7 @@ class voxelization_general : public testing::TestWithParam { voxel_num_desc_ = NULL; } if (voxel_num_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(voxel_num_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(voxel_num_)); voxel_num_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box.cpp 
b/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box.cpp index bde847869..194daa461 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box.cpp @@ -78,31 +78,31 @@ class yolo_box : public testing::Test { size_t ele_num = 8; size_t dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t bytes = ele_num * dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&x_, bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&x_, bytes)); } if (img_size) { size_t ele_num = 8; size_t dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t bytes = ele_num * dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&img_size_, bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&img_size_, bytes)); } if (anchors) { size_t ele_num = 8; size_t dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_INT32); size_t bytes = ele_num * dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&anchors_, bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&anchors_, bytes)); } if (boxes) { size_t ele_num = 8; size_t dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t bytes = ele_num * dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, bytes)); } if (scores) { size_t ele_num = 8; size_t dtype_bytes = mluOpDataTypeBytes(MLUOP_DTYPE_FLOAT); size_t bytes = ele_num * dtype_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, bytes)); } } @@ -145,23 +145,23 @@ class yolo_box : public testing::Test { scores_desc_ = NULL; } if (x_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(x_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(x_)); x_ = NULL; } if (img_size_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(img_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(img_size_)); img_size_ = NULL; } if (anchors_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(anchors_)); + GTEST_CHECK(cnrtSuccess 
== cnrtFree(anchors_)); anchors_ = NULL; } if (boxes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = NULL; } if (scores_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(scores_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(scores_)); scores_ = NULL; } } diff --git a/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box_general.cpp b/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box_general.cpp index 735358fd8..193b3694f 100644 --- a/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box_general.cpp +++ b/test/mlu_op_gtest/api_gtest/src/gtest/yolo_box/yolo_box_general.cpp @@ -61,7 +61,7 @@ class yolo_box_general : public testing::TestWithParam { uint64_t x_ele_num = mluOpGetTensorElementNum(x_desc_); uint64_t x_bytes = mluOpDataTypeBytes(x_dtype) * x_ele_num; if (x_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&x_, x_bytes)) + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&x_, x_bytes)) } MLUOP_CHECK(mluOpCreateTensorDescriptor(&img_size_desc_)); @@ -75,7 +75,7 @@ class yolo_box_general : public testing::TestWithParam { uint64_t img_ele_num = mluOpGetTensorElementNum(img_size_desc_); uint64_t img_bytes = mluOpDataTypeBytes(img_dtype) * img_ele_num; if (img_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&img_size_, img_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&img_size_, img_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&anchors_desc_)); @@ -89,7 +89,7 @@ class yolo_box_general : public testing::TestWithParam { uint64_t an_ele_num = mluOpGetTensorElementNum(anchors_desc_); uint64_t an_bytes = mluOpDataTypeBytes(an_dtype) * an_ele_num; if (an_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&anchors_, an_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&anchors_, an_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&boxes_desc_)); @@ -103,7 +103,7 @@ class yolo_box_general : public testing::TestWithParam { uint64_t b_ele_num = 
mluOpGetTensorElementNum(boxes_desc_); uint64_t b_bytes = mluOpDataTypeBytes(b_dtype) * b_ele_num; if (b_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&boxes_, b_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&boxes_, b_bytes)); } MLUOP_CHECK(mluOpCreateTensorDescriptor(&scores_desc_)); @@ -117,7 +117,7 @@ class yolo_box_general : public testing::TestWithParam { uint64_t s_ele_num = mluOpGetTensorElementNum(scores_desc_); uint64_t s_bytes = mluOpDataTypeBytes(s_dtype) * s_ele_num; if (s_bytes > 0) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&scores_, s_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&scores_, s_bytes)); } YoloBoxDescParam params = std::get<5>(GetParam()); @@ -169,23 +169,23 @@ class yolo_box_general : public testing::TestWithParam { scores_desc_ = NULL; } if (x_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(x_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(x_)); x_ = NULL; } if (img_size_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(img_size_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(img_size_)); img_size_ = NULL; } if (anchors_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(anchors_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(anchors_)); anchors_ = NULL; } if (boxes_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(boxes_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(boxes_)); boxes_ = NULL; } if (scores_) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(scores_)); + GTEST_CHECK(cnrtSuccess == cnrtFree(scores_)); scores_ = NULL; } } diff --git a/test/mlu_op_gtest/include/tools.h b/test/mlu_op_gtest/include/tools.h index 487128faf..f5aa3ab6a 100644 --- a/test/mlu_op_gtest/include/tools.h +++ b/test/mlu_op_gtest/include/tools.h @@ -67,7 +67,6 @@ size_t shapeStrideCount(const Shape *shape); // not include stride size_t shapeElementCount(const Shape *shape); -cnrtDataType_t cvtMluOpDtypeToCnrt(mluOpDataType_t dtype); cnrtDataType_V2_t cvtMluOpDtypeToCnrt_V2(mluOpDataType_t dtype); mluOpDataType_t cvtProtoDtypeToMluOp(DataType dtype); mluOpTensorLayout_t 
cvtProtoLayoutToMluOp(TensorLayout order); diff --git a/test/mlu_op_gtest/pb_gtest/gtest_config/test_list b/test/mlu_op_gtest/pb_gtest/gtest_config/test_list index 1fc0c0109..bfbf8c8bf 100644 --- a/test/mlu_op_gtest/pb_gtest/gtest_config/test_list +++ b/test/mlu_op_gtest/pb_gtest/gtest_config/test_list @@ -27,6 +27,8 @@ roi_align_backward roi_align_rotated_backward roi_align_rotated_forward roialign_forward +roi_pooling_backward +roi_pooling_forward roiaware_pool3d_backward roiaware_pool3d_forward roipoint_pool3d diff --git a/test/mlu_op_gtest/pb_gtest/include/runtime.h b/test/mlu_op_gtest/pb_gtest/include/runtime.h index cdfb2c0ae..9bc9fdeaa 100644 --- a/test/mlu_op_gtest/pb_gtest/include/runtime.h +++ b/test/mlu_op_gtest/pb_gtest/include/runtime.h @@ -37,6 +37,10 @@ #include "tools.h" #include "memory_pool.h" +#ifndef CNRT_RET_ERR_INVALID +#define CNRT_RET_ERR_INVALID (632007) +#endif + namespace mluoptest { class Runtime { @@ -49,11 +53,11 @@ class Runtime { void *allocate(size_t num_bytes, std::string name = "") { return NULL; } // this function will throw exception - cnrtRet_t deallocate(void *ptr) { return CNRT_RET_SUCCESS; } + cnrtRet_t deallocate(void *ptr) { return cnrtSuccess; } // this function won't throw exception // so only this function can be called in dtor - cnrtRet_t destroy() { return CNRT_RET_SUCCESS; } + cnrtRet_t destroy() { return cnrtSuccess; } // use cnrtRet_t, cuz when call cnrtFree .. can return directly. }; @@ -98,7 +102,7 @@ class CPURuntime : public Runtime { template cnrtRet_t deallocate(T object) { if (NULL == (void *)object) { - return CNRT_RET_SUCCESS; + return cnrtSuccess; } auto it = std::find_if(memory_blocks_.begin(), memory_blocks_.end(), [=](std::shared_ptr b) { @@ -115,7 +119,7 @@ class CPURuntime : public Runtime { // XXX(zhaolianshui): since using erase rather frequently, memory_blocks_ // should be std::list? 
memory_blocks_.erase(it); - return CNRT_RET_SUCCESS; + return cnrtSuccess; } // so only this function can be called in dtor diff --git a/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto b/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto index 17b9edf2d..84dfa8001 160000 --- a/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto +++ b/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto @@ -1 +1 @@ -Subproject commit 17b9edf2d55195a2d0ee51f5673add5f41c945d8 +Subproject commit 84dfa8001df9841f79417a086b7bf3fc56848eb8 diff --git a/test/mlu_op_gtest/pb_gtest/src/executor.cpp b/test/mlu_op_gtest/pb_gtest/src/executor.cpp index bc5517e07..b8ac27704 100644 --- a/test/mlu_op_gtest/pb_gtest/src/executor.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/executor.cpp @@ -84,11 +84,6 @@ Executor::~Executor() { hostFree(); baselineFree(); destroyTensors(); -// #if GTEST_ENABLE_GPERFTOOLS -// if (exe_config_ != nullptr && exe_config_->gtest_internal_cpu_profile) { -// ProfilerStop(); -// } -// #endif VLOG(4) << "Executor end."; } @@ -294,9 +289,9 @@ void Executor::launch() { bool Executor::ready() { auto ret = cnrtQueueWaitNotifier(exe_context_->hw_notifier->n_stop, exe_context_->queue, 0); - if (CNRT_RET_ERR_NOT_READY == ret) { + if (cnrtErrorNotReady == ret) { return false; - } else if (CNRT_RET_SUCCESS == ret) { + } else if (cnrtSuccess == ret) { return true; } else { GTEST_CHECK(false, @@ -386,7 +381,7 @@ void Executor::setupForPerfIter(int repeat, int iter, int iter_start) { void *src_data = getPerfSrcData(db); GTEST_CHECK(cnrtMemcpy(db->device_perf_ptr, src_data, db->size, CNRT_MEM_TRANS_DIR_DEV2DEV) == - CNRT_RET_SUCCESS); + cnrtSuccess); oss << "copy data from " << src_data; } else { oss << "random data is fed to MLU kernel, if it should be real " @@ -465,7 +460,7 @@ void Executor::setupForPerfIter(int repeat, int iter, int iter_start) { if (skipMallocDevice(db.getMetaTensor())) continue; void *src_data = getPerfSrcData(&db); GTEST_CHECK(cnrtMemcpy(db.device_perf_ptr, src_data, db.size, - 
CNRT_MEM_TRANS_DIR_DEV2DEV) == CNRT_RET_SUCCESS); + CNRT_MEM_TRANS_DIR_DEV2DEV) == cnrtSuccess); } } } @@ -534,10 +529,10 @@ std::tuple Executor::callBackKernelSyncAndGetTime( float hwtime = 0; cnrtNotifier_t n_start = notifier->n_start; cnrtNotifier_t n_stop = notifier->n_stop; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtPlaceNotifier(n_start, exe_context_->queue)); launch_kernel(); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtPlaceNotifier(n_stop, exe_context_->queue)); GTEST_CHECK(cnrtSuccess == cnrtQueueSync(exe_context_->queue)); size_t tp = MONITOR_CLOCK::now().time_since_epoch().count(); @@ -746,10 +741,9 @@ void Executor::getMluPerfInfo(PerfInfo *res) { // compute res->compute_force = getPeakComputeForce(); + res->theory_ops = getTheoryOps(); if (parser_->node()->has_theory_compute_ops()) { res->theory_ops = parser_->node()->theory_compute_ops(); - } else { - res->theory_ops = getTheoryOps(); } // op / ( (latency(us) / 1000 / 1000) * PEAK_COMPUTE_FORCE(op/s) ) @@ -921,7 +915,7 @@ bool Executor::checkBaseline() { double Executor::getBandWidthByDev() { int card = -1; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtGetDevice(&card)); + GTEST_CHECK(cnrtSuccess == cnrtGetDevice(&card)); GTEST_CHECK(cndevInit(0) == CNDEV_SUCCESS); cndevDDRInfo_t ddrinfo; ddrinfo.version = CNDEV_VERSION_5; @@ -938,14 +932,14 @@ double Executor::getBandWidthByDev() { int Executor::getIpuFrequency() { int ordinal = -1; int ipu_frequency = -1; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtGetDevice(&ordinal)); + GTEST_CHECK(cnrtSuccess == cnrtGetDevice(&ordinal)); GTEST_CHECK(cndevInit(0) == CNDEV_SUCCESS); cndevFrequencyInfo_t freqInfo; freqInfo.version = CNDEV_VERSION_5; GTEST_CHECK(CN_SUCCESS == cnDeviceGetAttribute(&ipu_frequency, - CN_DEVICE_ATTRIBUTE_CLUSTER_CLOCK_RATE, - ordinal)); + CN_DEVICE_ATTRIBUTE_CLUSTER_CLOCK_RATE, + ordinal)); VLOG(4) << "MLU Core Frequency = " << (double)ipu_frequency / 1000 / 1000 << " GHz"; return ipu_frequency; @@ 
-1389,12 +1383,13 @@ void Executor::saveInputWithStride() { // when tensor has stride, and input is reused, need use input to init // baselineOutput. if (!ts->stride.empty() && flag_input_reuse_) { + size_t cpu_dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(ts->dtype, &cpu_dtype_size)); cpu_fp32_stride_input_.push_back( (float *)cpu_runtime_.allocate(ts->total_count * sizeof(float))); - void *temp_gpu = cpu_runtime_.allocate( - ts->total_count * mluop::getSizeOfDataType(ts->dtype)); + void *temp_gpu = cpu_runtime_.allocate(ts->total_count * cpu_dtype_size); memcpy(temp_gpu, data_vector_[i].host_ptr, - ts->total_count * mluop::getSizeOfDataType(ts->dtype)); + ts->total_count * cpu_dtype_size); // BUG(zhaolianshui): the last allocated cpu_fp32_stride_input may not be // i'th; always float? castDataOut(temp_gpu, ts->dtype, cpu_fp32_stride_input_[i], @@ -1413,10 +1408,12 @@ void Executor::saveInputWithStrideByDtype() { // TODO(None): use is_input_and_output, now it is bug // TODO(None): move it to class Stride if (!ts->stride.empty() && flag_input_reuse_) { - cpu_stride_input_.push_back((float *)cpu_runtime_.allocate( - ts->total_count * mluop::getSizeOfDataType(ts->dtype))); + size_t cpu_dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(ts->dtype, &cpu_dtype_size)); + cpu_stride_input_.push_back( + (float *)cpu_runtime_.allocate(ts->total_count * cpu_dtype_size)); memcpy(cpu_stride_input_[i], data_vector_[i].host_ptr, - ts->total_count * mluop::getSizeOfDataType(ts->dtype)); + ts->total_count * cpu_dtype_size); } } } @@ -1453,7 +1450,9 @@ void Executor::baselineInputMalloc() { } // malloc a ptr with stride, to get random value // if this tensor has stride, will stride_in in castIn() - size_t cpu_dtype_size = mluop::getSizeOfDataType(getCpuDtype(ts->dtype)); + size_t cpu_dtype_size; + MLUOP_CHECK( + mluOpGetSizeOfDataType(getCpuDtype(ts->dtype), &cpu_dtype_size)); ts->cpu_ptr = (float *)cpu_runtime_.allocate( ts->total_count * cpu_dtype_size, ts->name); 
cpu_fp32_input_.push_back(ts->cpu_ptr); @@ -1701,15 +1700,15 @@ void Executor::deviceFree() noexcept { // TODO(None): group those device ptrs into a vector? if (data_vector_[i].device_origin_ptr != nullptr) { EXPECT_EQ(mlu_runtime_.deallocate(data_vector_[i].device_origin_ptr), - CNRT_RET_SUCCESS); + cnrtSuccess); } if (data_vector_[i].device_perf_ptr != nullptr && needDevPerfSpace()) { EXPECT_EQ(mlu_runtime_.deallocate(data_vector_[i].device_perf_ptr), - CNRT_RET_SUCCESS); + cnrtSuccess); } if (data_vector_[i].device_perf_data_ptr != nullptr) { EXPECT_EQ(mlu_runtime_.deallocate(data_vector_[i].device_perf_data_ptr), - CNRT_RET_SUCCESS); + cnrtSuccess); } } if (needDevRandomSpace()) { @@ -1726,7 +1725,7 @@ void Executor::deviceFree() noexcept { << "MLU Memory leaked that should be deallocate by user explicitly" << "(case: " << eva_res_.case_path << ")"; } - EXPECT_EQ(mlu_runtime_.destroy(), CNRT_RET_SUCCESS); + EXPECT_EQ(mlu_runtime_.destroy(), cnrtSuccess); } void Executor::freeLLC() { @@ -1811,7 +1810,9 @@ void Executor::castDataIn(float *src_data, mluOpDataType_t src_dtype, return; } if (src_dtype == dst_dtype) { - memcpy(dst_data, src_data, count * mluop::getSizeOfDataType(src_dtype)); + size_t dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(src_dtype, &dtype_size)); + memcpy(dst_data, src_data, count * dtype_size); } else if ((src_dtype == MLUOP_DTYPE_FLOAT && dst_dtype == MLUOP_DTYPE_INT8) || (src_dtype == MLUOP_DTYPE_FLOAT && @@ -1884,14 +1885,17 @@ void Executor::cnrtCastDataTypeWrap(void *src_data, char *src = reinterpret_cast(src_data); char *dst = reinterpret_cast(dst_data); for (size_t i = 0; i < count_repeat; ++i) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtCastDataType_V2(src, in_dtype, dst, out_dtype, INT_MAX, quant_param, cnrtRounding_rm)); - src += INT_MAX * mluop::getSizeOfDataType(src_dtype); - dst += INT_MAX * mluop::getSizeOfDataType(dst_dtype); + size_t src_dtype_size, dst_dtype_size; + 
MLUOP_CHECK(mluOpGetSizeOfDataType(src_dtype, &src_dtype_size)); + MLUOP_CHECK(mluOpGetSizeOfDataType(dst_dtype, &dst_dtype_size)); + src += INT_MAX * src_dtype_size; + dst += INT_MAX * dst_dtype_size; } if (count_remain) { - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtCastDataType_V2(src, in_dtype, dst, out_dtype, count_remain, quant_param, cnrtRounding_rm)); } @@ -1907,7 +1911,9 @@ void Executor::castDataOut(void *src_data, mluOpDataType_t src_dtype, return; } if (src_dtype == dst_dtype) { - memcpy(dst_data, src_data, count * mluop::getSizeOfDataType(src_dtype)); + size_t src_dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(src_dtype, &src_dtype_size)); + memcpy(dst_data, src_data, count * src_dtype_size); } else if (src_dtype == MLUOP_DTYPE_COMPLEX_HALF && dst_dtype == MLUOP_DTYPE_COMPLEX_FLOAT) { arrayCastFloatAndNormalWrapper(src_data, src_dtype, dst_data, dst_dtype, @@ -1924,11 +1930,11 @@ void Executor::castDataOut(void *src_data, mluOpDataType_t src_dtype, // need quant if (flag_quant_mode_ != POS_SCALE_OFFSET) { cnrtQuantizedParam_t quant_param = nullptr; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtCreateQuantizedParam(&quant_param, pos, scale, offset)); cnrtCastDataTypeWrap(src_data, src_dtype, dst_data, dst_dtype, count, quant_param); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtDestroyQuantizedParam(quant_param)); + GTEST_CHECK(cnrtSuccess == cnrtDestroyQuantizedParam(quant_param)); } else { if (src_dtype == MLUOP_DTYPE_INT8) { MLUOP_CHECK(mluop::castFixedToFloat32((int8_t *)src_data, dst_data, @@ -1976,12 +1982,13 @@ void Executor::quantizeTensorByChannel( float *scale = (float *)malloc(co * sizeof(float)); int *offset = (int *)malloc(co * sizeof(int)); int deal_count = count / co; + size_t dst_dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(dst_dtype, &dst_dtype_size)); for (int co_index = 0; co_index < co; ++co_index) { - castDataIn( - src_data + deal_count, MLUOP_DTYPE_FLOAT, - (char *)dst_data + deal_count * 
mluop::getSizeOfDataType(dst_dtype), - dst_dtype, deal_count, flag_quant_mode_, position + co_index, - scale + co_index, offset + co_index, true); + castDataIn(src_data + deal_count, MLUOP_DTYPE_FLOAT, + (char *)dst_data + deal_count * dst_dtype_size, dst_dtype, + deal_count, flag_quant_mode_, position + co_index, + scale + co_index, offset + co_index, true); } // MLUOP_CHECK(mluOpSetTensorDescriptorPositionScaleOffsetByChannel( // tensor_desc_[tensor_index].tensor, co, position, scale, offset)); @@ -2044,8 +2051,9 @@ void Executor::castIn() { online_quantize); // p/s, discarded. // get oc_dt's p/s and set to tensor. - void *temp = cpu_runtime_.allocate(ts->total_count * - mluop::getSizeOfDataType(ts->oc_dt)); + size_t dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(ts->oc_dt, &dtype_size)); + void *temp = cpu_runtime_.allocate(ts->total_count * dtype_size); castDataIn(src_data, MLUOP_DTYPE_FLOAT, // src data temp, ts->oc_dt, // dst data ts->total_count, // count @@ -2058,7 +2066,9 @@ void Executor::castIn() { } if (!ts->stride.empty()) { VLOG(4) << "Executor: " << ts->name << " host ptr been strided_out."; - size_t cpu_dtype_size = mluop::getSizeOfDataType(getCpuDtype(ts->dtype)); + size_t cpu_dtype_size; + MLUOP_CHECK( + mluOpGetSizeOfDataType(getCpuDtype(ts->dtype), &cpu_dtype_size)); void *temp = cpu_runtime_.allocate(ts->shape_count * cpu_dtype_size); memset(temp, 0x0, ts->shape_count * cpu_dtype_size); if (flag_input_reuse_) { @@ -2119,16 +2129,16 @@ void Executor::copyIn() { // copy in the same queue memcpy host to dev if (zero_input_) { VLOG(4) << "set device_origin_ptr space to 0"; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMemset(db->device_origin_ptr, 0, db->size)); } else if (mlu_need_host_data) { // use_real_data: a) compute diff; b) data_dependent (otherwise maybe // runtime error ) auto t_a = std::chrono::system_clock::now(); // host to dev for compute - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemcpy(db->device_origin_ptr, + 
GTEST_CHECK(cnrtSuccess == cnrtMemcpy(db->device_origin_ptr, db->host_ptr, db->size, - CNRT_MEM_TRANS_DIR_HOST2DEV)); + cnrtMemcpyHostToDev)); auto t_b = std::chrono::system_clock::now(); auto dur = std::chrono::duration_cast(t_b - t_a); @@ -2136,7 +2146,7 @@ void Executor::copyIn() { } if (needDevPerfDataSpace()) { VLOG(4) << "copy from device_origin_ptr to device_perf_data_ptr"; - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMemcpy(db->device_perf_data_ptr, db->device_origin_ptr, db->size, CNRT_MEM_TRANS_DIR_DEV2DEV)); } @@ -2160,7 +2170,7 @@ void Executor::copyIn() { } // set zeros to dev auto t_a = std::chrono::system_clock::now(); - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMemset(db->device_origin_ptr, 0, db->size)); auto t_b = std::chrono::system_clock::now(); auto dur = @@ -2171,7 +2181,7 @@ void Executor::copyIn() { // set to 0 if (needDevPerfDataSpace()) { // set zeros to dev for perf test - GTEST_CHECK(CNRT_RET_SUCCESS == + GTEST_CHECK(cnrtSuccess == cnrtMemset(db->device_perf_data_ptr, 0, db->size)); } } @@ -2192,9 +2202,9 @@ void Executor::copyOut() { // memcpy dev to host auto t_a = std::chrono::system_clock::now(); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemcpy(db->host_ptr, + GTEST_CHECK(cnrtSuccess == cnrtMemcpy(db->host_ptr, db->device_origin_ptr, db->size, - CNRT_MEM_TRANS_DIR_DEV2HOST)); + cnrtMemcpyDevToHost)); auto t_b = std::chrono::system_clock::now(); auto dur = std::chrono::duration_cast(t_b - t_a); eva_res_.mlu.d2h_time += dur.count(); @@ -2247,9 +2257,11 @@ void Executor::castHalfOuput() { "After cpuCompute, " "complex_float->complex_half->complex_float conversion is " "not implemented yet."); + size_t dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(ts->dtype, &dtype_size)); if (ts->dtype == MLUOP_DTYPE_HALF) { - int16_t *half_data = (int16_t *)cpu_runtime_.allocate( - ts->shape_count * mluop::getSizeOfDataType(ts->dtype)); + int16_t *half_data = + (int16_t *)cpu_runtime_.allocate(ts->shape_count 
* dtype_size); arrayCastFloatToHalf(half_data, cpu_fp32_output_[i], ts->shape_count); if (getFlagHalfInfTo65504()) { arrayCastHalfToFloatInvalidInf(cpu_fp32_output_[i], half_data, @@ -2259,8 +2271,8 @@ void Executor::castHalfOuput() { } cpu_runtime_.deallocate(half_data); } else if (ts->dtype == MLUOP_DTYPE_BFLOAT16) { - uint16_t *bf16_data = (uint16_t *)cpu_runtime_.allocate( - ts->shape_count * mluop::getSizeOfDataType(ts->dtype)); + uint16_t *bf16_data = + (uint16_t *)cpu_runtime_.allocate(ts->shape_count * dtype_size); arrayCastFloatToBF16(bf16_data, cpu_fp32_output_[i], ts->shape_count); arrayCastBF16ToFloat(cpu_fp32_output_[i], bf16_data, ts->shape_count); cpu_runtime_.deallocate(bf16_data); @@ -2511,11 +2523,12 @@ void Executor::baselineOutputMallocByDtype() { cpu_output_.emplace_back(nullptr); continue; } - void *temp_ptr = (void *)cpu_runtime_.allocate( - ts->shape_count * mluop::getSizeOfDataType(ts->dtype), ts->name); + size_t dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(ts->dtype, &dtype_size)); + void *temp_ptr = + (void *)cpu_runtime_.allocate(ts->shape_count * dtype_size, ts->name); cpu_output_.emplace_back(temp_ptr); - memset(temp_ptr, 0x0, - ts->shape_count * mluop::getSizeOfDataType(ts->dtype)); + memset(temp_ptr, 0x0, ts->shape_count * dtype_size); } } diff --git a/test/mlu_op_gtest/pb_gtest/src/executor_float_old.cpp b/test/mlu_op_gtest/pb_gtest/src/executor_float_old.cpp index ca6f497d5..ce7eb1025 100644 --- a/test/mlu_op_gtest/pb_gtest/src/executor_float_old.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/executor_float_old.cpp @@ -103,7 +103,9 @@ void Executor::baselineOutputMalloc() { cpu_fp32_output_.push_back(nullptr); continue; } - size_t cpu_dtype_size = mluop::getSizeOfDataType(getCpuDtype(ts->dtype)); + size_t cpu_dtype_size; + MLUOP_CHECK( + mluOpGetSizeOfDataType(getCpuDtype(ts->dtype), &cpu_dtype_size)); ts->cpu_ptr = (float *)cpu_runtime_.allocate( ts->shape_count * cpu_dtype_size, ts->name); cpu_fp32_output_.push_back(ts->cpu_ptr); 
@@ -121,7 +123,9 @@ void Executor::mluOutputMalloc() { mlu_fp32_output_.push_back(nullptr); continue; } - size_t cpu_dtype_size = mluop::getSizeOfDataType(getCpuDtype(ts->dtype)); + size_t cpu_dtype_size; + MLUOP_CHECK( + mluOpGetSizeOfDataType(getCpuDtype(ts->dtype), &cpu_dtype_size)); void *temp = cpu_runtime_.allocate(ts->total_count * cpu_dtype_size, ts->name); mlu_fp32_output_.push_back((float *)temp); @@ -136,7 +140,9 @@ void Executor::strideOutput() { if (!ts->stride.empty()) { // TODO(None): 2023-7-13: fix here VLOG(4) << "[WARNING] Executor: " << ts->name << " cpu ptr been strided_out."; - size_t cpu_dtype_size = mluop::getSizeOfDataType(getCpuDtype(ts->dtype)); + size_t cpu_dtype_size; + MLUOP_CHECK( + mluOpGetSizeOfDataType(getCpuDtype(ts->dtype), &cpu_dtype_size)); void *temp = cpu_runtime_.allocate(ts->total_count * cpu_dtype_size); if (!flag_input_reuse_) { // TODO(None): fix after zhaolianshui // fix is_output @@ -146,9 +152,9 @@ void Executor::strideOutput() { for (int i = 0; i < data_vector_.size(); i++) { // BUG(zhaolianshui): wrong, always get to the first one if (data_vector_[i].is_output()) { - memcpy(temp, cpu_fp32_stride_input_[i], - ts->total_count * - cpu_dtype_size); // TODO(None): cpu_stride? + memcpy( + temp, cpu_fp32_stride_input_[i], + ts->total_count * cpu_dtype_size); // TODO(None): cpu_stride? break; } } diff --git a/test/mlu_op_gtest/pb_gtest/src/gtest/mlu_op_gtest.cpp b/test/mlu_op_gtest/pb_gtest/src/gtest/mlu_op_gtest.cpp index 9528e659e..b9740f33f 100644 --- a/test/mlu_op_gtest/pb_gtest/src/gtest/mlu_op_gtest.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/gtest/mlu_op_gtest.cpp @@ -249,7 +249,7 @@ void TestSuite::ThreadX() { // in current thread, We should ensure different thread access the same device // when we use default queue. Sadly, CNRT have other multithread restriction, // we may need to write new thread model. 
- ASSERT_EQ(cnrtSetDevice(global_var.dev_id_), CNRT_RET_SUCCESS); + ASSERT_EQ(cnrtSetDevice(global_var.dev_id_), cnrtSuccess); size_t thread_num = global_var.thread_num_; size_t max_exe_vec_num = thread_num * 1.5; @@ -262,7 +262,7 @@ void TestSuite::ThreadX() { auto it = ctx->been_initialized.find(std::this_thread::get_id()); if (it == ctx->been_initialized .end()) { // if current thread has not been set device. - ASSERT_EQ(cnrtSetDevice(global_var.dev_id_), CNRT_RET_SUCCESS); + ASSERT_EQ(cnrtSetDevice(global_var.dev_id_), cnrtSuccess); ctx->been_initialized.insert(std::this_thread::get_id()); } }; diff --git a/test/mlu_op_gtest/pb_gtest/src/gtest/test_env.cpp b/test/mlu_op_gtest/pb_gtest/src/gtest/test_env.cpp index c6d884ffc..4e23b6f9d 100644 --- a/test/mlu_op_gtest/pb_gtest/src/gtest/test_env.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/gtest/test_env.cpp @@ -389,7 +389,7 @@ bool TestEnvironment::getBusId(int device_id, std::string &bus_id_str) { VLOG(4) << "Device " << device_id << "'s mlu id=" << str; } else { cnrtRet_t ret = cnrtDeviceGetPCIBusId(str, 100, device_id); - if (ret != CNRT_RET_SUCCESS) { + if (ret != cnrtSuccess) { LOG(WARNING) << "Fail to get device " << device_id << "'s bus id."; return false; } else { @@ -444,7 +444,7 @@ bool TestEnvironment::getComputeMode(std::string bus_id_str, char &mode) { void TestEnvironment::setDevice() { // 1.get device num unsigned int dev_num = 0; - ASSERT_EQ(cnrtGetDeviceCount(&dev_num), CNRT_RET_SUCCESS); + ASSERT_EQ(cnrtGetDeviceCount(&dev_num), cnrtSuccess); if (dev_num <= 0) { // dev_num_ should > 0 FAIL() << "Can't find device."; } else { diff --git a/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp b/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp index 4472f28c0..0e5f808ec 100644 --- a/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp @@ -48,7 +48,7 @@ hardwareMonitor::~hardwareMonitor() { void hardwareMonitor::setDevice() const { 
GTEST_CHECK(cndevInit(0) == CNDEV_SUCCESS); - ASSERT_EQ(cnrtSetDevice(global_var.dev_id_), CNRT_RET_SUCCESS); + ASSERT_EQ(cnrtSetDevice(global_var.dev_id_), cnrtSuccess); } void hardwareMonitor::start() { @@ -130,7 +130,7 @@ void hardwareMonitor::monitorFrequencyOneGRepeat() { std::ios::app); frequency_file << "relative_time(ns),IPU_frequency(MHz)\n"; cndevDevice_t dev_id; - GTEST_CHECK(cnrtGetDevice(&dev_id) == CNRT_RET_SUCCESS); + GTEST_CHECK(cnrtGetDevice(&dev_id) == cnrtSuccess); int i = 1; cndevFrequencyInfo_t freq_info_prev, freq_info_curr; @@ -172,7 +172,7 @@ void hardwareMonitor::monitorPowerOneGRepeat() { std::ios::app); power_file << "relative_time(ns),instantaneous_power(W),average_power(W)\n"; cndevDevice_t dev_id; - GTEST_CHECK(cnrtGetDevice(&dev_id) == CNRT_RET_SUCCESS); + GTEST_CHECK(cnrtGetDevice(&dev_id) == cnrtSuccess); GTEST_CHECK(cndevInit(0) == CNDEV_SUCCESS); int i = 1; diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc.mlu b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc.mlu index 750145a32..7e57910bd 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc.mlu +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc.mlu @@ -30,7 +30,7 @@ mluOpStatus_t mluOpFillLLC(mluOpHandle_t handle, void *const_addr, return MLUOP_STATUS_SUCCESS; } cnrtDim3_t k_dim{1, 1, 1}; - cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_BLOCK; + cnrtFunctionType_t k_type = cnrtFuncTypeBlock; KERNEL_CHECK( (flushLLC<<queue>>>(const_addr, llc_size))); return MLUOP_STATUS_SUCCESS; diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu index fc61a2819..c5b9077e7 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu @@ -29,7 +29,7 @@ __mlu_global__ void flushLLC(void* input, 
int fill_bytes) { return; } int size_once = 256 * 1024; - __nram__ char buf[256 * 1024]; + __nram__ int8_t buf[256 * 1024]; int repeat = fill_bytes / size_once; for (int i = 0; i < repeat; i++) { __asm__ volatile(".volatile{ld.3d.async.nram.gdram.scmlock [%[dst]], [%[src]], \ diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram.mlu b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram.mlu index a49a8a0ea..3b2a7d34d 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram.mlu +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram.mlu @@ -33,9 +33,9 @@ static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { #if TARGET_MLU_ARCH == 520 - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; #else - *k_type = CNRT_FUNC_TYPE_UNION1; + *k_type = cnrtFuncTypeUnion1; #endif k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); k_dim->y = mluop::runtime::getClusterLimitCapability(handle); diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram_device.mlu b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram_device.mlu index 6ceed1673..91d2441f7 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram_device.mlu +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_ram/fill_ram_device.mlu @@ -29,12 +29,12 @@ #define FILL_SRAM_SIZE (MAX_SRAM_SIZE + REM_FOR_STACK - 4096) #define ONCE_WRAM (256 * 1024) #define ONCE_WRAM_STRIDE (ONCE_WRAM / LT_NUM) -__nram__ char nram_buffer[FILL_NRAM_SIZE]; -__wram__ char wram_buffer[MAX_WRAM_SIZE]; +__nram__ int8_t nram_buffer[FILL_NRAM_SIZE]; +__wram__ int8_t wram_buffer[MAX_WRAM_SIZE]; template __mlu_func__ void fillRamBase(const int &fill_value, - char *sram_buffer) { + int8_t *sram_buffer) { T *tmp = (T *)&fill_value; __bang_write_value(nram_buffer, FILL_NRAM_SIZE / sizeof(T), (T)*tmp); #if __BANG_ARCH__ == 520 // use nram to fill wram @@ 
-42,7 +42,7 @@ __mlu_func__ void fillRamBase(const int &fill_value, uint32_t last_size = MAX_WRAM_SIZE - (wram_repeat - 1) * ONCE_WRAM; for (int i = 0; i < wram_repeat; ++i) { uint32_t size = i == wram_repeat - 1 ? last_size : ONCE_WRAM; - char *wram = wram_buffer + i * ONCE_WRAM_STRIDE; + int8_t *wram = wram_buffer + i * ONCE_WRAM_STRIDE; __memcpy(wram, nram_buffer, size, NRAM2WRAM); } #else // use sram to fill wram @@ -61,9 +61,9 @@ __mlu_func__ void fillRamBase(const int &fill_value, __mlu_global__ void MLUBlockKernelFillRam(nram_value value) { #if __BANG_ARCH__ != 520 - __mlu_shared__ char sram_buffer[FILL_SRAM_SIZE]; + __mlu_shared__ int8_t sram_buffer[FILL_SRAM_SIZE]; #else - char *sram_buffer = NULL; + int8_t *sram_buffer = NULL; #endif const int nan_half_value = 0xffff; diff --git a/test/mlu_op_gtest/pb_gtest/src/memory_pool.cpp b/test/mlu_op_gtest/pb_gtest/src/memory_pool.cpp index d9f8087e5..5fc44c01e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/memory_pool.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/memory_pool.cpp @@ -276,7 +276,7 @@ void *MLUMemoryPool::allocate(size_t num_bytes, const std::string &name) { #endif while (maxFreeSize > 0) { auto result = cnrtMalloc(&ptr, maxFreeSize); - if (CNRT_RET_SUCCESS == result) { + if (cnrtSuccess == result) { cnGetMemAttribute((void *)&is_linear, CN_MEM_ATTRIBUTE_ISLINEAR, (CNaddr)ptr); if (is_linear) @@ -288,7 +288,7 @@ void *MLUMemoryPool::allocate(size_t num_bytes, const std::string &name) { } num_bytes = maxFreeSize; } else { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMalloc(&ptr, num_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMalloc(&ptr, num_bytes)); } ctx_->chunks.emplace_back(Chunk(num_bytes, num_bytes, ptr)); @@ -300,7 +300,7 @@ void *MLUMemoryPool::allocate(size_t num_bytes, const std::string &name) { void MLUMemoryPool::deallocate(void *ptr) { for (auto it = ctx_->chunks.begin(); it != ctx_->chunks.end(); ++it) { if ((*it).ptr == ptr) { - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtFree(ptr)); + GTEST_CHECK(cnrtSuccess 
== cnrtFree(ptr)); ctx_->total_allocated_size -= (*it).allocated_size; it = ctx_->chunks.erase(it); return; @@ -311,7 +311,7 @@ void MLUMemoryPool::deallocate(void *ptr) { void MLUMemoryPool::destroy() { for (auto it = ctx_->chunks.begin(); it != ctx_->chunks.end(); ++it) { if (it->ptr != nullptr) { - GTEST_WARNING(CNRT_RET_SUCCESS == cnrtFree(it->ptr)); + GTEST_WARNING(cnrtSuccess == cnrtFree(it->ptr)); it->ptr = nullptr; ctx_->total_allocated_size -= it->allocated_size; } diff --git a/test/mlu_op_gtest/pb_gtest/src/parser.cpp b/test/mlu_op_gtest/pb_gtest/src/parser.cpp index f02c3662b..6ac6657fb 100644 --- a/test/mlu_op_gtest/pb_gtest/src/parser.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/parser.cpp @@ -268,8 +268,10 @@ void Parser::parse(const std::string &file) { mt->total_count = getTensorStrideCount(pt, mt->value_type); // shape_count come from value_f/value_i/value_h/value_ui/value_ul and // shape. not include stride + size_t dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(mt->dtype, &dtype_size)); mt->shape_count = getTensorShapeCount(pt); - mt->sizeof_dtype = mluop::getSizeOfDataType(mt->dtype); + mt->sizeof_dtype = dtype_size; mt->size_in_bytes = mt->total_count * mt->sizeof_dtype; if (mt->total_count != mt->shape_count) { VLOG(4) << "WARNING: Parser: the " << mt->name diff --git a/test/mlu_op_gtest/pb_gtest/src/runtime.cpp b/test/mlu_op_gtest/pb_gtest/src/runtime.cpp index 8374ce02a..53ccd4964 100644 --- a/test/mlu_op_gtest/pb_gtest/src/runtime.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/runtime.cpp @@ -39,7 +39,7 @@ CPURuntime::CPURuntime() {} CPURuntime::~CPURuntime() {} // all member variable are shared_ptr. 
-cnrtRet_t CPURuntime::destroy() { return CNRT_RET_SUCCESS; } +cnrtRet_t CPURuntime::destroy() { return cnrtSuccess; } void *CPURuntime::allocate(void *ptr, std::string name) { if (ptr == NULL) { @@ -119,12 +119,12 @@ bool MLURuntime::checkOneMemBlock(const struct MemBlock &mem_block) { char *footer = header + mem_block.raw_bytes - mask_bytes_ - mem_block.unalign_address_offset; GTEST_CHECK( - CNRT_RET_SUCCESS == cnrtMemcpy((void *)header_check_.get(), header, - mask_bytes_, CNRT_MEM_TRANS_DIR_DEV2HOST), + cnrtSuccess == cnrtMemcpy((void *)header_check_.get(), header, + mask_bytes_, cnrtMemcpyDevToHost), "MLURuntime: memcpy device to host failed when check overwritten"); GTEST_CHECK( - CNRT_RET_SUCCESS == cnrtMemcpy((void *)footer_check_.get(), footer, - mask_bytes_, CNRT_MEM_TRANS_DIR_DEV2HOST), + cnrtSuccess == cnrtMemcpy((void *)footer_check_.get(), footer, + mask_bytes_, cnrtMemcpyDevToHost), "MLURuntime: memcpy device to host failed when check overwritten"); if (!check_byte((void *)header_check_.get(), (void *)header_mask_.get(), @@ -147,11 +147,11 @@ bool MLURuntime::checkOneMemBlock(const struct MemBlock &mem_block) { MLURuntime::~MLURuntime() {} bool MLURuntime::freeOneMemBlock(const struct MemBlock &mem_block) { - cnrtRet_t ret = CNRT_RET_SUCCESS; + cnrtRet_t ret = cnrtSuccess; bool ok = true; char *header = mem_block.header; ret = cnrtFree(header - mem_block.unalign_address_offset); - if (ret != CNRT_RET_SUCCESS) { + if (ret != cnrtSuccess) { ADD_FAILURE() << "MLURuntime: free mlu memory failed. 
Addr = " << (void *)header; ok = false; @@ -161,7 +161,7 @@ bool MLURuntime::freeOneMemBlock(const struct MemBlock &mem_block) { } cnrtRet_t MLURuntime::destroy() { - cnrtRet_t ret = CNRT_RET_SUCCESS; + cnrtRet_t ret = cnrtSuccess; bool ok = true; for (auto mem_block : memory_blocks_) { ok = ok && (freeOneMemBlock(mem_block)); @@ -213,7 +213,7 @@ void *MLURuntime::allocate(size_t num_bytes, std::string name, if (true == check_enable_) { raw_bytes += 2 * mask_bytes_; } - cnrtRet_t ret = CNRT_RET_SUCCESS; + cnrtRet_t ret = cnrtSuccess; if (!const_dram) { VLOG(4) << "memory allocated by cnrtMalloc"; ret = cnrtMalloc((void **)&raw_addr, raw_bytes); @@ -222,7 +222,7 @@ void *MLURuntime::allocate(size_t num_bytes, std::string name, ret = cnrtMallocConstant((void **)&raw_addr, raw_bytes); } printLinearMemoryMsg(raw_addr, raw_bytes); - if (raw_addr == NULL || ret != CNRT_RET_SUCCESS) { + if (raw_addr == NULL || ret != cnrtSuccess) { LOG(ERROR) << "MLURuntime: Failed to allocate " << num_bytes << " bytes."; throw std::invalid_argument(std::string(__FILE__) + " +" + std::to_string(__LINE__)); @@ -245,8 +245,8 @@ void *MLURuntime::allocate(size_t num_bytes, std::string name, #endif ret = cnrtMemcpy(header, (void *)header_mask_.get(), mask_bytes_, - CNRT_MEM_TRANS_DIR_HOST2DEV); - if (ret != CNRT_RET_SUCCESS) { + cnrtMemcpyHostToDev); + if (ret != cnrtSuccess) { LOG(ERROR) << "MLURuntime: Failed to copy header " << num_bytes << " bytes."; throw std::invalid_argument(std::string(__FILE__) + " +" + @@ -255,8 +255,8 @@ void *MLURuntime::allocate(size_t num_bytes, std::string name, } ret = cnrtMemcpy(footer, (void *)footer_mask_.get(), mask_bytes_, - CNRT_MEM_TRANS_DIR_HOST2DEV); - if (ret != CNRT_RET_SUCCESS) { + cnrtMemcpyHostToDev); + if (ret != cnrtSuccess) { LOG(ERROR) << "MLURuntime: Failed to copy footer " << num_bytes << " bytes."; throw std::invalid_argument(std::string(__FILE__) + " +" + @@ -275,13 +275,13 @@ void *MLURuntime::allocate(size_t num_bytes, std::string name, 
cnrtRet_t MLURuntime::deallocate(void *mlu_addr) { if (mlu_addr == NULL) { - return CNRT_RET_SUCCESS; + return cnrtSuccess; } char *header = (char *)mlu_addr; if (true == check_enable_) { header = header - mask_bytes_; } - cnrtRet_t ret = CNRT_RET_SUCCESS; + cnrtRet_t ret = cnrtSuccess; // get header and footer auto it = std::find_if(memory_blocks_.begin(), memory_blocks_.end(), [=](MemBlock b) { return b.header == header; }); diff --git a/test/mlu_op_gtest/pb_gtest/src/stride.cpp b/test/mlu_op_gtest/pb_gtest/src/stride.cpp index 1daebec68..30333bdc0 100644 --- a/test/mlu_op_gtest/pb_gtest/src/stride.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/stride.cpp @@ -137,7 +137,8 @@ void *Stride::strideOutputByDtype() { return pimpl_->strideOutputByDtype(); } void *Stride::StrideImpl::strideOutputByDtype() { if (have_stride_) { - size_t dtype_size = mluop::getSizeOfDataType(ts_->dtype); + size_t dtype_size; + MLUOP_CHECK(mluOpGetSizeOfDataType(ts_->dtype, &dtype_size)); void *tensor_out = cpu_runtime_->allocate(ts_->total_count * dtype_size); if (init_by_input_) { memcpy(tensor_out, tensor_copy_, ts_->total_count * dtype_size); diff --git a/test/mlu_op_gtest/pb_gtest/src/tools.cpp b/test/mlu_op_gtest/pb_gtest/src/tools.cpp index ee2e94a39..de1873903 100644 --- a/test/mlu_op_gtest/pb_gtest/src/tools.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/tools.cpp @@ -347,37 +347,6 @@ void saveHexDataToFile(const std::string &file, void *data, fout.close(); } -cnrtDataType_t cvtMluOpDtypeToCnrt(mluOpDataType_t dtype) { - switch (dtype) { - case MLUOP_DTYPE_HALF: - return CNRT_FLOAT16; - case MLUOP_DTYPE_FLOAT: - return CNRT_FLOAT32; - case MLUOP_DTYPE_DOUBLE: - return CNRT_FLOAT64; - case MLUOP_DTYPE_INT8: - return CNRT_INT8; - case MLUOP_DTYPE_INT16: - return CNRT_INT16; - case MLUOP_DTYPE_INT32: - return CNRT_INT32; - case MLUOP_DTYPE_INT64: - return CNRT_INT64; - case MLUOP_DTYPE_BOOL: - return CNRT_BOOL; - case MLUOP_DTYPE_UINT8: - return CNRT_UINT8; - case MLUOP_DTYPE_UINT16: - return 
CNRT_UINT16; - case MLUOP_DTYPE_UINT32: - return CNRT_UINT32; - default: - LOG(ERROR) << "NOT support this dtype yet"; - throw std::invalid_argument(std::string(__FILE__) + " +" + - std::to_string(__LINE__)); - } -} - cnrtDataType_V2_t cvtMluOpDtypeToCnrt_V2(mluOpDataType_t dtype) { switch (dtype) { case MLUOP_DTYPE_HALF: diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp index a50de33b6..b0f6f229f 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/adam_w/adam_w.cpp @@ -91,8 +91,8 @@ void AdamWExecutor::setMiscellaneousParam() { } void AdamWExecutor::cpuCompute() { - assert(parser_->getInputNum() == 5); - assert(parser_->getOutputNum() == 4); + GTEST_CHECK(parser_->getInputNum() == 5); + GTEST_CHECK(parser_->getOutputNum() == 4); float lr = parser_->getProtoNode()->adamw_param().lr(); float beta1 = parser_->getProtoNode()->adamw_param().beta1(); float beta2 = parser_->getProtoNode()->adamw_param().beta2(); @@ -110,10 +110,10 @@ void AdamWExecutor::cpuCompute() { auto count4 = parser_->getInputDataCount(3); auto count5 = parser_->getInputDataCount(4); - assert(count1 == count2); - assert(count1 == count3); - assert(count1 == count4); - assert(count1 == count5); + GTEST_CHECK(count1 == count2); + GTEST_CHECK(count1 == count3); + GTEST_CHECK(count1 == count4); + GTEST_CHECK(count1 == count5); auto cpu_tensor_param = cpu_fp32_input_[0]; auto cpu_tensor_paramh = cpu_fp32_input_[1]; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/ball_query/ball_query.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/ball_query/ball_query.cpp index 0b7d98613..0ab6c2774 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/ball_query/ball_query.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/ball_query/ball_query.cpp @@ -52,7 +52,7 @@ void BallQueryExecutor::compute() { auto data_vector = (int *)data_vector_[2].host_ptr; // set idx to 0 size_t output_total_bytes = 
data_vector_[2].count * sizeof(int32_t); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(idx_ptr, 0, output_total_bytes)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(idx_ptr, 0, output_total_bytes)); interface_timer_.start(); MLUOP_CHECK(mluOpBallQuery(handle_, new_xyz_desc, new_xyz_ptr, xyz_desc, xyz_ptr, min_radius_, max_radius_, nsample_, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp index 3433b162d..f413b8184 100755 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/box_iou_rotated/box_iou_rotated.cpp @@ -325,7 +325,7 @@ template int BoxIouRotatedExecutor::convexHullGraham(const Point (&p)[24], const int &num_in, Point (&q)[24]) { - assert(num_in >= 2); + GTEST_CHECK(num_in >= 2); // Step1: // Find point with minimum y // if more than 1 points have the same minimum y, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/carafe_forward/carafe_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/carafe_forward/carafe_forward.cpp index a5dfe0ebe..19d6435e9 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/carafe_forward/carafe_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/carafe_forward/carafe_forward.cpp @@ -65,8 +65,8 @@ void CarafeForwardExecutor::compute() { } void CarafeForwardExecutor::cpuCompute() { - assert(parser_->getInputNum() == 2); - assert(parser_->getOutputNum() == 1); + GTEST_CHECK(parser_->getInputNum() == 2); + GTEST_CHECK(parser_->getOutputNum() == 1); auto carafe_desc_node = parser_->getProtoNode()->carafe_param(); @@ -74,9 +74,9 @@ void CarafeForwardExecutor::cpuCompute() { int group_size = carafe_desc_node.group_size(); int scale_factor = carafe_desc_node.scale_factor(); - assert(kernel_size >= 1 && (kernel_size - 1) % 2 == 0); - assert(scale_factor >= 1); - assert(group_size >= 1); + GTEST_CHECK(kernel_size >= 1 && (kernel_size - 1) % 2 == 0); + GTEST_CHECK(scale_factor >= 1); 
+ GTEST_CHECK(group_size >= 1); int half_kernel_size = (kernel_size - 1) / 2; @@ -99,15 +99,15 @@ void CarafeForwardExecutor::cpuCompute() { int output_dimW = mluOpGetTensordimW(output_desc); int output_dimC = mluOpGetTensordimC(output_desc); - assert(input_dimN == mask_dimN); - assert(input_dimN == output_dimN); - assert(input_dimC == output_dimC); - assert(mask_dimC == kernel_size * kernel_size * group_size); - assert(mask_dimH == scale_factor * input_dimH); - assert(mask_dimW == scale_factor * input_dimW); - assert(mask_dimH == output_dimH); - assert(mask_dimW == output_dimW); - assert(input_dimC % group_size == 0); + GTEST_CHECK(input_dimN == mask_dimN); + GTEST_CHECK(input_dimN == output_dimN); + GTEST_CHECK(input_dimC == output_dimC); + GTEST_CHECK(mask_dimC == kernel_size * kernel_size * group_size); + GTEST_CHECK(mask_dimH == scale_factor * input_dimH); + GTEST_CHECK(mask_dimW == scale_factor * input_dimW); + GTEST_CHECK(mask_dimH == output_dimH); + GTEST_CHECK(mask_dimW == output_dimW); + GTEST_CHECK(input_dimC % group_size == 0); int channels_per_group = input_dimC / group_size; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp index b29e9a525..f8a89520e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_backward_weight/dcn_backward_weight.cpp @@ -346,6 +346,7 @@ void DcnBackwardWeightExecutor::transpose(float *input, float *output, if (dim_desc > 8 || dim_desc <= 0) { LOG(ERROR) << "dim_desc is " << dim_desc << ", it shoule less than 8 and greater than 0"; + return; } { std::vector().swap(permute_desc); } for (int i = 0; i < dim_num; i++) { @@ -378,7 +379,7 @@ static void BatchMatMul(const int &g, const int &m, const int &k, const int &n, bool is_transa, bool is_transb) { const int batch_size = g; - assert(batch_size >= 1); + GTEST_CHECK(batch_size 
>= 1); #if USE_OPENBLAS const CBLAS_ORDER Order = CblasRowMajor; const CBLAS_TRANSPOSE TransA = is_transa ? CblasTrans : CblasNoTrans; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp index 9a210931d..39d56f13e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/dcn_forward/dcn_forward.cpp @@ -351,6 +351,7 @@ void DcnForwardExecutor::transpose(float *input, float *output, if (dim_desc > 8 || dim_desc <= 0) { LOG(ERROR) << "dim_desc is " << dim_desc << ", it shoule less than 8 and greater than 0"; + return; } { std::vector().swap(permute_desc); } for (int i = 0; i < dim_num; i++) { @@ -382,7 +383,7 @@ static void BatchMatMul(const int &g, const int &m, const int &k, const int &n, const bool is_transa, const bool is_transb) { const int batch_size = g; - assert(batch_size >= 1); + GTEST_CHECK(batch_size >= 1); #if USE_OPENBLAS const CBLAS_ORDER Order = CblasRowMajor; const CBLAS_TRANSPOSE TransA = is_transa ? 
CblasTrans : CblasNoTrans; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp index 2bd491fc6..e54ea793a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/div/div.cpp @@ -116,7 +116,7 @@ void DivExecutor::expand_compute_cpu(std::vector shape_a, } bool can_broadcast = canBroadCast(shape_a, shape_b); - assert(can_broadcast == 1); + GTEST_CHECK(can_broadcast); uint64_t sizeA = 1; uint64_t sizeB = 1; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp index 588c904ec..19f8753f1 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/focal_loss_sigmoid_backward/focal_loss_sigmoid_backward.cpp @@ -91,7 +91,7 @@ void FocalLossSigmoidBackwardExecutor::setMiscellaneousParam() { } void FocalLossSigmoidBackwardExecutor::cpuCompute() { - assert(parser_->getOutputNum() == 1); + GTEST_CHECK(parser_->getOutputNum() == 1); auto alpha = parser_->getProtoNode()->focal_loss_sigmoid_backward_param().alpha(); auto gamma = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp index bfb30769a..da8a3d91a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/generate_proposals_v2/generate_proposals_v2.cpp @@ -54,25 +54,25 @@ void GenerateProposalsV2Executor::workspaceMalloc() { void *output_ptr = parser_->getMetaTensor("output1").dev_origin_ptr; size_t output_size = parser_->getMetaTensor("output1").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output_ptr, 0, output_size)); + GTEST_CHECK(cnrtSuccess == 
cnrtMemset(output_ptr, 0, output_size)); void *output_ptr1 = parser_->getMetaTensor("output2").dev_origin_ptr; size_t output_size1 = parser_->getMetaTensor("output2").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output_ptr1, 0, output_size1)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(output_ptr1, 0, output_size1)); void *output_ptr2 = parser_->getMetaTensor("output3").dev_origin_ptr; size_t output_size2 = parser_->getMetaTensor("output3").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output_ptr2, 0, output_size2)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(output_ptr2, 0, output_size2)); void *output_ptr3 = parser_->getMetaTensor("output4").dev_origin_ptr; size_t output_size3 = parser_->getMetaTensor("output4").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output_ptr3, 0, output_size3)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(output_ptr3, 0, output_size3)); } void GenerateProposalsV2Executor::workspaceFree() { if (workspace_[0]) { VLOG(4) << "Free device workspace space."; - GTEST_CHECK(CNRT_RET_SUCCESS == mlu_runtime_.deallocate(workspace_[0])); + GTEST_CHECK(cnrtSuccess == mlu_runtime_.deallocate(workspace_[0])); workspace_[0] = nullptr; } } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp index a6c6ea3fe..2e3aa4930 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_data/indice_convolution_backward_data.cpp @@ -252,11 +252,11 @@ void IndiceConvolutionBackwardDataExecutor::cpuTransposeFilter( } void IndiceConvolutionBackwardDataExecutor::cpuCompute() { - assert(parser_->getInputNum() == 3); - assert(parser_->getOutputNum() == 1); + GTEST_CHECK(parser_->getInputNum() == 3); + GTEST_CHECK(parser_->getOutputNum() == 1); 
VLOG(4) << "compute cpu IndiceConvolutionBackwardData"; auto count = parser_->getOutputDataCount(0); - assert(count != 0); + GTEST_CHECK(count != 0); getFilterDims(); setSpconvdataParams(); int K = kd * kh * kw; @@ -300,7 +300,7 @@ void IndiceConvolutionBackwardDataExecutor::cpuCompute() { for (int kk = 0; kk < K; ++kk) { int filter_offset = kk * dxc * dyc; int index_num = (int)(indice_num_[kk]); - assert(L >= index_num); + GTEST_CHECK(L >= index_num); for (int l = 0; l < index_num; ++l) { // index_pair data loop int input_idx = indice_pairs[kk * 2 * L + l]; int output_idx = indice_pairs[kk * 2 * L + L + l]; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp index a08cb0d2d..e38c7e602 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp @@ -206,12 +206,15 @@ int64_t IndiceConvolutionBackwardFilterExecutor::getTheoryIoSize() { int64_t in_active_num = input_indice_desc_->dims[0]; int64_t kernel_volume = indice_pair_desc_->dims[0]; int64_t theory_ios = 0; - auto input_indice_dwidth = - mluop::getSizeOfDataType(input_indice_desc_->dtype); - auto diffy_indice_dwidth = - mluop::getSizeOfDataType(diffy_indice_desc_->dtype); - auto indice_pair_dwidth = mluop::getSizeOfDataType(indice_pair_desc_->dtype); - auto diffw_dwidth = mluop::getSizeOfDataType(diffw_desc_->dtype); + size_t input_indice_dwidth, diffy_indice_dwidth, indice_pair_dwidth, + diffw_dwidth; + MLUOP_CHECK( + mluOpGetSizeOfDataType(input_indice_desc_->dtype, &input_indice_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(diffy_indice_desc_->dtype, &diffy_indice_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(indice_pair_desc_->dtype, 
&indice_pair_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(diffw_desc_->dtype, &diffw_dwidth)); auto gather_nd_ios = [&](const int64_t kernel_index, const int64_t gather_num, const int64_t channel, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp index 0f45e17e7..61a80277e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/indice_convolution_forward/indice_convolution_forward.cpp @@ -209,10 +209,14 @@ int64_t IndiceConvolutionForwardExecutor::getTheoryIoSize() { int64_t num_active_out = features_out_desc_->dims[0]; int64_t num_filters = indice_pairs_desc_->dims[0]; int64_t theory_ios = 0; - auto features_dwidth = mluop::getSizeOfDataType(features_desc_->dtype); - auto filters_dwidth = mluop::getSizeOfDataType(filters_desc_->dtype); - auto indice_pairs_dwith = mluop::getSizeOfDataType(indice_pairs_desc_->dtype); - auto features_out_dwith = mluop::getSizeOfDataType(features_out_desc_->dtype); + size_t features_dwidth, filters_dwidth, indice_pairs_dwith, + features_out_dwith; + MLUOP_CHECK(mluOpGetSizeOfDataType(features_desc_->dtype, &features_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(filters_desc_->dtype, &filters_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(indice_pairs_desc_->dtype, &indice_pairs_dwith)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(features_out_desc_->dtype, &features_out_dwith)); auto gather_scatter_ios = [&](const int64_t index, const int64_t num, const int64_t channel, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/lgamma/test_case/case_0.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/lgamma/test_case/case_0.prototxt new file mode 100644 index 000000000..cfa485af9 --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/lgamma/test_case/case_0.prototxt @@ -0,0 +1,32 @@ +op_name: "lgamma" +input { 
+ id: "input" + shape: { + dims: 1024 + dims: 2 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 23 + upper_bound: 100 + lower_bound: -100 + distribution: UNIFORM + } +} +output { + id: "output" + shape: { + dims: 1024 + dims: 2 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/log/log.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/log/log.cpp index 340d2ab0e..72c81220c 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/log/log.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/log/log.cpp @@ -56,8 +56,8 @@ void LogExecutor::setMiscellaneousParam() { } void LogExecutor::cpuCompute() { - assert(parser_->getInputNum() == 1); - assert(parser_->getOutputNum() == 1); + GTEST_CHECK(parser_->getInputNum() == 1); + GTEST_CHECK(parser_->getOutputNum() == 1); auto count = parser_->getInputDataCount(0); mluOpLogBase_t base = @@ -76,7 +76,7 @@ void LogExecutor::cpuCompute() { cpu_fp32_output_[0][i] = log10(cpu_fp32_input_[0][i]); } } else { - assert(0); + GTEST_CHECK(0); } } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp new file mode 100644 index 000000000..cbf8b12a0 --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.cpp @@ -0,0 +1,98 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "logspace.h" + +namespace mluoptest { + +void LogspaceExecutor::initData() { + start_num_ = parser_->getProtoNode()->logspace_param().start(); + end_num_ = parser_->getProtoNode()->logspace_param().end(); + steps_num_ = parser_->getProtoNode()->logspace_param().steps(); + base_num_ = parser_->getProtoNode()->logspace_param().base(); +} + +void LogspaceExecutor::paramCheck() { + GTEST_CHECK(parser_->outputs().size() == 1, + "logspace tensor output number is wrong."); +} + +void LogspaceExecutor::compute() { + VLOG(4) << "LogspaceExecutor compute "; + initData(); + + auto tensor_y = tensor_desc_[1].tensor; + auto dev_y = data_vector_[1].device_ptr; + + VLOG(4) << "call mluOpLogspace()"; + interface_timer_.start(); + MLUOP_CHECK(mluOpLogspace(handle_, start_num_, end_num_, (int64_t)steps_num_, + base_num_, tensor_y, dev_y)); + interface_timer_.stop(); +} + +void LogspaceExecutor::cpuCompute() { + if (steps_num_ == 1) { + cpu_fp32_output_[0][0] = (half)::powf(base_num_, start_num_); + } else { + auto count = parser_->output(0)->shape_count; + float step = (end_num_ - start_num_) / (steps_num_ - 1); + + switch (tensor_desc_[1].tensor->dtype) { + case MLUOP_DTYPE_FLOAT: { + for (int i = 0; i < count; ++i) { + cpu_fp32_output_[0][i] = ::powf(base_num_, start_num_ + step * i); + } + }; break; + case MLUOP_DTYPE_HALF: { + half step = + ((half)end_num_ - (half)start_num_) / (half)((half)steps_num_ - 1); + int halfway = steps_num_ / 2; + for (int i = 0; i < count; ++i) { + if (i < halfway) { + cpu_fp32_output_[0][i] = + (half)::pow((half)base_num_, (half)start_num_ + step * i); + } else { + cpu_fp32_output_[0][i] = (half)::pow( + (half)base_num_, (half)end_num_ - step * (steps_num_ - i - 1)); + } + } + }; break; + case MLUOP_DTYPE_INT32: { + for (int i = 0; i < count; ++i) { + cpu_fp32_output_[0][i] = + (int)::powf(base_num_, start_num_ + step * i); + } + }; break; + default: + break; + } + 
} +} + +int64_t LogspaceExecutor::getTheoryOps() { + int64_t theory_ops = parser_->output(0)->total_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.h b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.h new file mode 100644 index 000000000..7ba562bfb --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/logspace.h @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_LOGSPACE_LOGSPACE_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_LOGSPACE_LOGSPACE_H_ +#include "executor.h" + +namespace mluoptest { + +class LogspaceExecutor : public Executor { + public: + LogspaceExecutor() {} + ~LogspaceExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; + + private: + void initData(); + + float start_num_; + float end_num_; + int steps_num_; + float base_num_; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_LOGSPACE_LOGSPACE_H_ //NOLINT \ No newline at end of file diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/test_case/logspace_data_included_float32_1725532639524.prototxt b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/test_case/logspace_data_included_float32_1725532639524.prototxt new file mode 100644 index 000000000..aa2b6f08f --- /dev/null +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/logspace/test_case/logspace_data_included_float32_1725532639524.prototxt @@ -0,0 +1,163 @@ +device: GPU +op_name: "logspace" +input { + id: "NULL" + shape { + dims: 0 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "output1" + shape { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + value_i: 1092616192 + value_i: 1093003396 + value_i: 1093404898 + value_i: 1093821226 + value_i: 1094252928 + value_i: 1094700571 + value_i: 1095164744 + value_i: 1095646057 + value_i: 1096145144 + value_i: 1096662660 + value_i: 1097199286 + value_i: 1097755728 + value_i: 1098332718 + value_i: 1098919331 + value_i: 1099229525 + value_i: 1099551174 + value_i: 1099884701 + value_i: 1100230543 + value_i: 1100589156 + value_i: 1100961012 + value_i: 1101346599 + value_i: 1101746424 + value_i: 1102161014 + value_i: 1102590913 + value_i: 1103036686 + value_i: 1103498921 + value_i: 1103978224 + value_i: 1104475227 + value_i: 1104990582 + value_i: 1105524967 + value_i: 
1106079085 + value_i: 1106653666 + value_i: 1107249463 + value_i: 1107581759 + value_i: 1107902064 + value_i: 1108234198 + value_i: 1108578596 + value_i: 1108935712 + value_i: 1109306014 + value_i: 1109689991 + value_i: 1110088147 + value_i: 1110501005 + value_i: 1110929109 + value_i: 1111373021 + value_i: 1111833325 + value_i: 1112310627 + value_i: 1112805554 + value_i: 1113318757 + value_i: 1113850911 + value_i: 1114402715 + value_i: 1114974896 + value_i: 1115568205 + value_i: 1115934144 + value_i: 1116253112 + value_i: 1116583859 + value_i: 1116926819 + value_i: 1117282443 + value_i: 1117651199 + value_i: 1118033572 + value_i: 1118430065 + value_i: 1118841200 + value_i: 1119267516 + value_i: 1119709574 + value_i: 1120167956 + value_i: 1120643265 + value_i: 1121136125 + value_i: 1121647185 + value_i: 1122177116 + value_i: 1122726616 + value_i: 1123296408 + value_i: 1123887239 + value_i: 1124286680 + value_i: 1124604317 + value_i: 1124933682 + value_i: 1125275210 + value_i: 1125629349 + value_i: 1125996565 + value_i: 1126377342 + value_i: 1126772179 + value_i: 1127181596 + value_i: 1127606132 + value_i: 1128046344 + value_i: 1128502812 + value_i: 1128976136 + value_i: 1129466938 + value_i: 1129975864 + value_i: 1130503582 + value_i: 1131050788 + value_i: 1131618200 + value_i: 1132206564 + value_i: 1132639367 + value_i: 1132955677 + value_i: 1133283667 + value_i: 1133623769 + value_i: 1133976429 + value_i: 1134342112 + value_i: 1134721298 + value_i: 1135114487 + value_i: 1135522194 + value_i: 1135944957 + value_i: 1136383331 + value_i: 1136837893 + value_i: 1137309240 + value_i: 1137797992 + value_i: 1138304793 + value_i: 1138830308 + value_i: 1139375228 + value_i: 1139940270 + value_i: 1140526178 + value_i: 1140992205 + value_i: 1141307193 + value_i: 1141633814 + value_i: 1141972495 + value_i: 1142323683 + value_i: 1142687838 + value_i: 1143065441 + value_i: 1143456988 + value_i: 1143862993 + value_i: 1144283990 + value_i: 1144720534 + value_i: 1145173197 + 
value_i: 1145642576 + value_i: 1146129287 + value_i: 1146633971 + value_i: 1147157292 + value_i: 1147699936 + value_i: 1148262619 + value_i: 1148846080 + thresholds { + evaluation_threshold: 1e-05 + evaluation_threshold: 1e-05 + evaluation_threshold_imag: -1 + evaluation_threshold_imag: -1 + } +} +evaluation_criterion: DIFF1 +evaluation_criterion: DIFF2 +handle_param { + round_mode: ROUND_OFF_ZERO +} +logspace_param { + start: 1 + end: 3 + steps: 128 + base: 10 +} diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp index ca28c192e..d605d83b4 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp @@ -138,11 +138,15 @@ int64_t MoeDispatchBackwardDataExecutor::getTheoryOps() { } int64_t MoeDispatchBackwardDataExecutor::getTheoryIoSize() { - auto gates_dwidth = mluop::getSizeOfDataType(desc_gates_->dtype); - auto indices_dwidth = mluop::getSizeOfDataType(desc_indices_->dtype); - auto locations_dwidth = mluop::getSizeOfDataType(desc_locations_->dtype); - auto dispatch_dwidth = mluop::getSizeOfDataType(desc_dispatch_->dtype); - auto grad_input_dwidth = mluop::getSizeOfDataType(desc_grad_input_->dtype); + size_t gates_dwidth, indices_dwidth, locations_dwidth, dispatch_dwidth, + grad_input_dwidth; + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_gates_->dtype, &gates_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_indices_->dtype, &indices_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_locations_->dtype, &locations_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_dispatch_->dtype, &dispatch_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_grad_input_->dtype, &grad_input_dwidth)); int64_t gates_theory_ios = samples_mask_num_ * gates_dwidth; int64_t indices_theory_ios 
= samples_mask_num_ * indices_dwidth; diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp index 01370dede..87b61a955 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp @@ -65,7 +65,7 @@ void MoeDispatchBackwardGateExecutor::workspaceMalloc() { void MoeDispatchBackwardGateExecutor::workspaceFree() { if (workspace_size_ > 0) { VLOG(4) << "[MoeDispatchBackwardGateExecutor] Free device workspace space."; - GTEST_CHECK(CNRT_RET_SUCCESS == mlu_runtime_.deallocate(workspace_[0])); + GTEST_CHECK(cnrtSuccess == mlu_runtime_.deallocate(workspace_[0])); workspace_[0] = nullptr; } } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp index 6f5886c11..cea466c02 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/moe_dispatch_forward/moe_dispatch_forward.cpp @@ -28,22 +28,17 @@ void MoeDispatchForwardExecutor::paramCheck() { if (!parser_->getProtoNode()->has_moe_dispatch_forward_param()) { LOG(ERROR) << "Lose moe_dispatch_forward_param."; } - GTEST_CHECK( - parser_->inputs().size() == 5, - "[MoeDispatchForwardExecutor] tensor input number is wrong."); - GTEST_CHECK( - parser_->outputs().size() == 1, - "[MoeDispatchForwardExecutor] tensor output number is wrong."); + GTEST_CHECK(parser_->inputs().size() == 5, + "[MoeDispatchForwardExecutor] tensor input number is wrong."); + GTEST_CHECK(parser_->outputs().size() == 1, + "[MoeDispatchForwardExecutor] tensor output number is wrong."); flag_input_reuse_ = true; } void MoeDispatchForwardExecutor::initData() { - samples_ = - 
parser_->getProtoNode()->moe_dispatch_forward_param().samples(); - capacity_ = - parser_->getProtoNode()->moe_dispatch_forward_param().capacity(); - hidden_ = - parser_->getProtoNode()->moe_dispatch_forward_param().hidden(); + samples_ = parser_->getProtoNode()->moe_dispatch_forward_param().samples(); + capacity_ = parser_->getProtoNode()->moe_dispatch_forward_param().capacity(); + hidden_ = parser_->getProtoNode()->moe_dispatch_forward_param().hidden(); num_experts_ = parser_->getProtoNode()->moe_dispatch_forward_param().num_experts(); } @@ -96,14 +91,14 @@ void MoeDispatchForwardExecutor::cpuCompute() { output[i] = dispatch[i]; } for (int i = 0; i < samples_; ++i) { - if (locations[i] >= 0 && locations[i] < capacity_ && - indices[i] >= 0 && indices[i] < num_experts_) { - for (int j = 0; j < hidden_; ++j) { - int idx = ((int)indices[i] * capacity_ + - (int)locations[i]) * (hidden_) + j; - output[idx] = gates[i] * input[i * (hidden_) + j]; - } - } + if (locations[i] >= 0 && locations[i] < capacity_ && indices[i] >= 0 && + indices[i] < num_experts_) { + for (int j = 0; j < hidden_; ++j) { + int idx = + ((int)indices[i] * capacity_ + (int)locations[i]) * (hidden_) + j; + output[idx] = gates[i] * input[i * (hidden_) + j]; + } + } } VLOG(4) << "[MoeDispatchForwardExecutor] call cpuCompute() end."; @@ -118,25 +113,28 @@ int64_t MoeDispatchForwardExecutor::getTheoryOps() { } int64_t MoeDispatchForwardExecutor::getTheoryIoSize() { - auto gates_dwidth = mluop::getSizeOfDataType(desc_gates_->dtype); - auto indices_dwidth = mluop::getSizeOfDataType(desc_indices_->dtype); - auto locations_dwidth = mluop::getSizeOfDataType(desc_locations_->dtype); - auto input_dwidth = mluop::getSizeOfDataType(desc_input_->dtype); - auto dispatch_dwidth = mluop::getSizeOfDataType(desc_input_->dtype); + size_t gates_dwidth, indices_dwidth, locations_dwidth, input_dwidth, + dispatch_dwidth; + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_gates_->dtype, &gates_dwidth)); + 
MLUOP_CHECK(mluOpGetSizeOfDataType(desc_indices_->dtype, &indices_dwidth)); + MLUOP_CHECK( + mluOpGetSizeOfDataType(desc_locations_->dtype, &locations_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_input_->dtype, &input_dwidth)); + MLUOP_CHECK(mluOpGetSizeOfDataType(desc_input_->dtype, &dispatch_dwidth)); int64_t gates_theory_ios = samples_ * gates_dwidth; int64_t indices_theory_ios = samples_ * indices_dwidth; int64_t locations_theory_ios = samples_ * locations_dwidth; int64_t input_theory_ios = samples_ * hidden_ * input_dwidth; - int64_t dispatch_theory_ios = num_experts_ * capacity_ * hidden_ * - dispatch_dwidth; + int64_t dispatch_theory_ios = + num_experts_ * capacity_ * hidden_ * dispatch_dwidth; int64_t theory_ios = gates_theory_ios + indices_theory_ios + locations_theory_ios + input_theory_ios + dispatch_theory_ios; - VLOG(4) << "MoeDispatchForwardExecutor::getTheoryIoSize() : " - << theory_ios << " IoSize"; + VLOG(4) << "MoeDispatchForwardExecutor::getTheoryIoSize() : " << theory_ios + << " IoSize"; return theory_ios; } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp index 39cb6809b..0eb97444e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/nms/nms.cpp @@ -80,12 +80,12 @@ void NmsExecutor::workspaceMalloc() { parser_->getMetaTensor("input1").size_in_bytes == 0) && output_size > 0) { void *output_ptr = parser_->getMetaTensor("output1").dev_origin_ptr; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output_ptr, 0, output_size)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(output_ptr, 0, output_size)); } void *output2_ptr = parser_->getMetaTensor("output2").dev_origin_ptr; size_t output2_size = parser_->getMetaTensor("output2").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output2_ptr, 0, output2_size)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(output2_ptr, 0, output2_size)); eva_->setMluWorkspaceSize(workspace_size_); } @@ -411,7 
+411,7 @@ void NmsExecutor::nms_detection_cpu( } void NmsExecutor::cpuCompute() { - assert(parser_->getInputNum() == 2); + GTEST_CHECK(parser_->getInputNum() == 2); // assert(parser_->getOutputNum() == 1); int max_output_boxes = diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp index 958fb5cc1..8fc555ecd 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/nms_rotated/nms_rotated.cpp @@ -101,10 +101,10 @@ void NmsRotatedExecutor::compute() { auto result_num = data_vector_[3].device_ptr; size_t workspace_size = 0; size_t output_size = parser_->getMetaTensor("output1").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(dev_output, 0, output_size)); - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(result_num, 0, sizeof(int32_t))); + GTEST_CHECK(cnrtSuccess == cnrtMemset(dev_output, 0, output_size)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(result_num, 0, sizeof(int32_t))); - // GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(dev_output, 0, + // GTEST_CHECK(cnrtSuccess == cnrtMemset(dev_output, 0, // output->dims[0] * sizeof(int64_t))); VLOG(4) << "call mluOpNmsRotated()"; interface_timer_.start(); @@ -359,7 +359,7 @@ template int NmsRotatedExecutor::convexHullGraham(const Point (&p)[24], const int &num_in, Point (&q)[24]) { - assert(num_in >= 2); + GTEST_CHECK(num_in >= 2); // Step1: // Find point with minimum y // if more than 1 points have the same minimum y, diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp index 199f06980..01dd4acd5 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/poly_nms/poly_nms.cpp @@ -54,13 +54,13 @@ void PolyNmsExecutor::workspaceMalloc() { auto output_tensor = parser_->getMetaTensor("output1").tensor; void *output_ptr = 
parser_->getMetaTensor("output1").dev_origin_ptr; size_t output_size = parser_->getMetaTensor("output1").size_in_bytes; - GTEST_CHECK(CNRT_RET_SUCCESS == cnrtMemset(output_ptr, 0, output_size)); + GTEST_CHECK(cnrtSuccess == cnrtMemset(output_ptr, 0, output_size)); } void PolyNmsExecutor::workspaceFree() { if (workspace_[0]) { VLOG(4) << "Free device workspace space."; - GTEST_CHECK(CNRT_RET_SUCCESS == mlu_runtime_.deallocate(workspace_[0])); + GTEST_CHECK(cnrtSuccess == mlu_runtime_.deallocate(workspace_[0])); workspace_[0] = nullptr; } } diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp index ceebe53aa..86cc8aa9e 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_align_backward/roi_align_backward.cpp @@ -32,12 +32,10 @@ void RoiAlignBackwardExecutor::paramCheck() { GTEST_CHECK(parser_->getProtoNode()->has_roi_align_backward_param(), "mluOpRoiAlignBackward: lose param. 
"); - GTEST_CHECK( - parser_->getInputNum() == 2 || parser_->getInputNum() == 4, - "mluOpRoiAlignBackward: tensor input number is wrong."); - GTEST_CHECK( - parser_->getOutputNum() == 1, - "mluOpRoiAlignBackward: tensor output number is wrong."); + GTEST_CHECK(parser_->getInputNum() == 2 || parser_->getInputNum() == 4, + "mluOpRoiAlignBackward: tensor input number is wrong."); + GTEST_CHECK(parser_->getOutputNum() == 1, + "mluOpRoiAlignBackward: tensor output number is wrong."); } void RoiAlignBackwardExecutor::compute() { @@ -316,8 +314,21 @@ void RoiAlignBackwardExecutor::cpuCompute() { int64_t RoiAlignBackwardExecutor::getTheoryOps() { int64_t theory_ops = 0; - - auto boxes = parser_->getMetaTensor(1).cpu_ptr; + float *host_boxes = nullptr; + Device device = parser_->device(); + if (device != Device::CPU) { + auto boxes_desc = tensor_desc_[1].tensor; + auto boxes_dtype = boxes_desc->dtype; + size_t boxes_num = parser_->getInputDataCount(1); + float *boxes_ptr = + (float *)cpu_runtime_.allocate(boxes_num * sizeof(float)); + castDataOut(data_vector_[1].host_ptr, boxes_dtype, (float *)boxes_ptr, + MLUOP_DTYPE_FLOAT, boxes_num, NO_QUANT, 0, 1, 0); + host_boxes = boxes_ptr; + } else { + host_boxes = cpu_fp32_input_[1]; + } + auto boxes = host_boxes; auto input_desc = parser_->getMetaTensor(0).tensor; float spatial_scale = parser_->getProtoNode()->roi_align_backward_param().spatial_scale(); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp index 17e7da197..df34ac217 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/roi_pooling_backward/roi_pooling_backward.cpp @@ -47,9 +47,9 @@ void RoiPoolingBackwardExecutor::compute() { auto argmax_desc = parser_->getMetaTensor(2).tensor; auto grads_image_desc = parser_->getMetaTensor(3).tensor; float spatial_scale = - 
parser_->getProtoNode()->roi_pooling_backward_param().spatial_scale(); + parser_->getProtoNode()->roi_pooling_backward_param().spatial_scale(); PoolingForwardMode mode = - parser_->getProtoNode()->roi_pooling_backward_param().mode(); + parser_->getProtoNode()->roi_pooling_backward_param().mode(); mluOpPoolingMode_t cmode; if (mode == POOLING_MAX) { cmode = MLUOP_POOLING_MAX; @@ -57,9 +57,9 @@ void RoiPoolingBackwardExecutor::compute() { VLOG(4) << "call mluop mluOpRoiPoolingBackward()"; interface_timer_.start(); - MLUOP_CHECK(mluOpRoiPoolingBackward(handle_, cmode, grads_desc, grads, - rois_desc, rois, argmax_desc, (int *)argmax, - spatial_scale, grads_image_desc, grads_image)); + MLUOP_CHECK(mluOpRoiPoolingBackward( + handle_, cmode, grads_desc, grads, rois_desc, rois, argmax_desc, + (int *)argmax, spatial_scale, grads_image_desc, grads_image)); interface_timer_.stop(); } @@ -74,9 +74,9 @@ void RoiPoolingBackwardExecutor::cpuCompute() { auto argmax_desc = parser_->getMetaTensor(2).tensor; auto grads_image_desc = parser_->getMetaTensor(3).tensor; float spatial_scale = - parser_->getProtoNode()->roi_pooling_backward_param().spatial_scale(); + parser_->getProtoNode()->roi_pooling_backward_param().spatial_scale(); PoolingForwardMode mode = - parser_->getProtoNode()->roi_pooling_backward_param().mode(); + parser_->getProtoNode()->roi_pooling_backward_param().mode(); size_t grads_n = grads_desc->dims[0]; size_t grads_h = grads_desc->dims[1]; @@ -101,9 +101,9 @@ void RoiPoolingBackwardExecutor::cpuCompute() { const int pooled_width = grads_w; const int num_rois = argmax_n; - auto transData = [&](float *old_data, float *new_data, - TensorLayout old_order, TensorLayout new_order, - int n, int c, int d, int h, int w) { + auto transData = [&](float *old_data, float *new_data, TensorLayout old_order, + TensorLayout new_order, int n, int c, int d, int h, + int w) { if (old_data == nullptr || new_data == nullptr) { LOG(ERROR) << "data address do not malloc in cpu compute."; 
return; @@ -115,9 +115,10 @@ void RoiPoolingBackwardExecutor::cpuCompute() { for (int dd = 0; dd < d; dd++) { for (int hh = 0; hh < h; hh++) { for (int ww = 0; ww < w; ww++) { - new_data[nn * c * d * h * w + cc * d * h * w + dd * h * w - + hh * w + ww] = old_data[nn * d * h * w * c + - dd * h * w * c + hh * w * c + ww * c + cc]; + new_data[nn * c * d * h * w + cc * d * h * w + dd * h * w + + hh * w + ww] = + old_data[nn * d * h * w * c + dd * h * w * c + hh * w * c + + ww * c + cc]; } } } @@ -129,9 +130,10 @@ void RoiPoolingBackwardExecutor::cpuCompute() { for (int dd = 0; dd < d; dd++) { for (int hh = 0; hh < h; hh++) { for (int ww = 0; ww < w; ww++) { - new_data[nn * c * d * h * w + dd * h * w * c + hh * w * c - + ww * c + cc] = old_data[nn * d * h * w * c + - cc * d * h * w + dd * h * w + hh * w + ww]; + new_data[nn * c * d * h * w + dd * h * w * c + hh * w * c + + ww * c + cc] = + old_data[nn * d * h * w * c + cc * d * h * w + dd * h * w + + hh * w + ww]; } } } @@ -140,26 +142,27 @@ void RoiPoolingBackwardExecutor::cpuCompute() { } }; - float *top_diff = (float *)cpu_runtime_.allocate(grads_n * 1 * grads_c * - grads_h * grads_w * sizeof(float)); - transData(grads, top_diff, LAYOUT_NDHWC, LAYOUT_NCDHW, grads_n, - grads_c, 1, grads_h, grads_w); - float *argmax_data = (float *)cpu_runtime_.allocate(argmax_n * 1 * - argmax_c * argmax_h * argmax_w * sizeof(float)); - transData(argmax, argmax_data, LAYOUT_NDHWC, LAYOUT_NCDHW, argmax_n, - argmax_c, 1, argmax_h, argmax_w); - float *bottom_diff = (float *)cpu_runtime_.allocate(grads_image_n * 1 * - grads_image_c * grads_image_h * grads_image_w * sizeof(float)); + float *top_diff = (float *)cpu_runtime_.allocate( + grads_n * 1 * grads_c * grads_h * grads_w * sizeof(float)); + transData(grads, top_diff, LAYOUT_NDHWC, LAYOUT_NCDHW, grads_n, grads_c, 1, + grads_h, grads_w); + float *argmax_data = (float *)cpu_runtime_.allocate( + argmax_n * 1 * argmax_c * argmax_h * argmax_w * sizeof(float)); + transData(argmax, 
argmax_data, LAYOUT_NDHWC, LAYOUT_NCDHW, argmax_n, argmax_c, + 1, argmax_h, argmax_w); + float *bottom_diff = (float *)cpu_runtime_.allocate( + grads_image_n * 1 * grads_image_c * grads_image_h * grads_image_w * + sizeof(float)); for (int i = 0; i < batch_size * channels * height * width; i++) { *((float *)bottom_diff + i) = 0; } - for (int ind = 0; ind < num_rois * channels * - pooled_height * pooled_width; ind++) { + for (int ind = 0; ind < num_rois * channels * pooled_height * pooled_width; + ind++) { int pw = ind % pooled_width; int ph = (ind / pooled_width) % pooled_height; int c = (ind / pooled_width / pooled_height) % channels; - int n = ind / pooled_width / pooled_height / channels; + int n = ind / pooled_width / pooled_height / channels; const float *offset_bottom_rois = rois + n * 5; int roi_batch_ind = offset_bottom_rois[0]; int bottom_offset = (roi_batch_ind * channels + c) * height * width; @@ -175,8 +178,8 @@ void RoiPoolingBackwardExecutor::cpuCompute() { } } - transData(bottom_diff, grads_image, LAYOUT_NCDHW, LAYOUT_NDHWC, - grads_image_n, grads_image_c, 1, grads_image_h, grads_image_w); + transData(bottom_diff, grads_image, LAYOUT_NCDHW, LAYOUT_NDHWC, grads_image_n, + grads_image_c, 1, grads_image_h, grads_image_w); cpu_runtime_.deallocate(bottom_diff); cpu_runtime_.deallocate(top_diff); cpu_runtime_.deallocate(argmax_data); @@ -185,12 +188,23 @@ void RoiPoolingBackwardExecutor::cpuCompute() { int64_t RoiPoolingBackwardExecutor::getTheoryOps() { int64_t theory_ops = 0; - auto argmax = parser_->getMetaTensor(2).cpu_ptr; + float *host_argmax = nullptr; + Device device = parser_->device(); + if (device != Device::CPU) { + auto argmax_desc = tensor_desc_[2].tensor; + auto argmax_dtype = argmax_desc->dtype; + size_t argmax_num = parser_->getInputDataCount(2); + float *argmax = (float *)cpu_runtime_.allocate(argmax_num * sizeof(float)); + castDataOut(data_vector_[2].host_ptr, argmax_dtype, (float *)argmax, + MLUOP_DTYPE_FLOAT, argmax_num, NO_QUANT, 0, 1, 
0); + host_argmax = argmax; + } else { + host_argmax = cpu_fp32_input_[2]; + } + auto argmax = host_argmax; + auto grads_desc = parser_->getMetaTensor(0).tensor; - auto rois_desc = parser_->getMetaTensor(1).tensor; - auto argmax_desc = parser_->getMetaTensor(2).tensor; auto grads_image_desc = parser_->getMetaTensor(3).tensor; - size_t grads_n = grads_desc->dims[0]; size_t grads_h = grads_desc->dims[1]; size_t grads_w = grads_desc->dims[2]; @@ -201,18 +215,18 @@ int64_t RoiPoolingBackwardExecutor::getTheoryOps() { size_t grads_image_c = grads_image_desc->dims[3]; theory_ops += grads_image_n * grads_image_h * grads_image_w * grads_image_c; - for (int i = 0; i < grads_n * grads_h * grads_w * grads_c; i++) { + for (size_t i = 0; i < grads_n * grads_h * grads_w * grads_c; i++) { if (argmax[i] != -1) { theory_ops++; } } - VLOG(4) << "getTheoryOps: " << theory_ops << "ops."; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; return theory_ops; } -std::set RoiPoolingBackwardExecutor:: - getCriterionsUse() const { - return { Evaluator::DIFF1, Evaluator::DIFF2, Evaluator::DIFF4 }; +std::set RoiPoolingBackwardExecutor::getCriterionsUse() + const { + return {Evaluator::DIFF1, Evaluator::DIFF2, Evaluator::DIFF4}; } } // namespace mluoptest diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt/sqrt.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt/sqrt.cpp index feec26a7f..18c0db1be 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt/sqrt.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt/sqrt.cpp @@ -48,8 +48,8 @@ void SqrtExecutor::compute() { } void SqrtExecutor::cpuCompute() { - assert(parser_->getInputNum() == 1); - assert(parser_->getOutputNum() == 1); + GTEST_CHECK(parser_->getInputNum() == 1); + GTEST_CHECK(parser_->getOutputNum() == 1); auto count1 = parser_->getInputDataCount(0); auto count2 = parser_->getOutputDataCount(0); diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt_backward/sqrt_backward.cpp 
b/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt_backward/sqrt_backward.cpp index c2e8b9aed..ff8077d52 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt_backward/sqrt_backward.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sqrt_backward/sqrt_backward.cpp @@ -46,12 +46,12 @@ void SqrtBackwardExecutor::compute() { } void SqrtBackwardExecutor::cpuCompute() { - assert(parser_->getInputNum() == 2); - assert(parser_->getOutputNum() == 1); + GTEST_CHECK(parser_->getInputNum() == 2); + GTEST_CHECK(parser_->getOutputNum() == 1); auto count1 = parser_->getInputDataCount(0); auto count2 = parser_->getInputDataCount(1); - assert(count1 == count2); + GTEST_CHECK(count1 == count2); for (size_t i = 0; i < count1; ++i) { cpu_fp32_output_[0][i] = diff --git a/test/mlu_op_gtest/pb_gtest/tests/cnrt_test.h b/test/mlu_op_gtest/pb_gtest/tests/cnrt_test.h index bf9dac87d..31d083773 100644 --- a/test/mlu_op_gtest/pb_gtest/tests/cnrt_test.h +++ b/test/mlu_op_gtest/pb_gtest/tests/cnrt_test.h @@ -40,7 +40,7 @@ // // int dev = 0; // not ptr, it's long unsigned int // unsigned int dev_num = 0; -// ASSERT_TRUE(CNRT_RET_SUCCESS == cnrtGetDeviceCount(&dev_num)); +// ASSERT_TRUE(cnrtSuccess == cnrtGetDeviceCount(&dev_num)); // ASSERT_GT(dev_num, 0); // dev_num > 0 // // ASSERT_TRUE(cnrtSuccess == cnrtGetDevice(&dev)); // use device: 0 @@ -83,7 +83,7 @@ TEST(DISABLED_CNRT, cnrtMemGetInfo) { unsigned int dev_num = 0; - ASSERT_TRUE(CNRT_RET_SUCCESS == cnrtGetDeviceCount(&dev_num)); + ASSERT_TRUE(cnrtSuccess == cnrtGetDeviceCount(&dev_num)); ASSERT_GT(dev_num, 0); // dev_num > 0 int dev_id; @@ -100,18 +100,18 @@ TEST(DISABLED_CNRT, cnrtMemGetInfo) { // example, // this test is useless -// test cnrtNotifier: +// test cnrtNotifier_t: // 1.create notifier // 2.place notifier // 3.get duration // 4.destroy notifier // multi-thread create and place notifier to 1 queue. 
// and multi-thread destroy these notifier -TEST(DISABLED_CNRT, cnrtNotifier) { +TEST(DISABLED_CNRT, cnrtNotifier_t) { const size_t thread_num = 4; unsigned int dev_num = 0; - ASSERT_TRUE(CNRT_RET_SUCCESS == cnrtGetDeviceCount(&dev_num)); + ASSERT_TRUE(cnrtSuccess == cnrtGetDeviceCount(&dev_num)); ASSERT_GT(dev_num, 0); int dev_id; @@ -130,15 +130,15 @@ TEST(DISABLED_CNRT, cnrtNotifier) { auto task_part1 = [&queue, &ctxs](int idx) { ASSERT_TRUE(cnrtSuccess == cnrtNotifierCreate(&(ctxs.at(idx).na))); ASSERT_TRUE(cnrtSuccess == cnrtNotifierCreate(&(ctxs.at(idx).nb))); - ASSERT_TRUE(CNRT_RET_SUCCESS == cnrtPlaceNotifier(ctxs.at(idx).na, queue)); - ASSERT_TRUE(CNRT_RET_SUCCESS == cnrtPlaceNotifier(ctxs.at(idx).nb, queue)); + ASSERT_TRUE(cnrtSuccess == cnrtPlaceNotifier(ctxs.at(idx).na, queue)); + ASSERT_TRUE(cnrtSuccess == cnrtPlaceNotifier(ctxs.at(idx).nb, queue)); }; auto task_part2 = [&queue, &ctxs](int idx) { ASSERT_TRUE(cnrtSuccess == cnrtQueueSync(queue)); float hwt = -1.0f; - ASSERT_TRUE(CNRT_RET_SUCCESS == + ASSERT_TRUE(cnrtSuccess == cnrtNotifierDuration(ctxs.at(idx).na, ctxs.at(idx).nb, &hwt)); ASSERT_EQ(0.0f, hwt); diff --git a/tools/coverage.sh b/tools/coverage.sh index 4015581d1..8d01aa13f 100755 --- a/tools/coverage.sh +++ b/tools/coverage.sh @@ -83,9 +83,11 @@ function process () { # run gtest readonly mluops_dir=$(dirname ${lib_path_}) export LD_LIBRARY_PATH="${mluops_dir}":$LD_LIBRARY_PATH - export CNRT_DUMP_PGO=1 + export CNRT_DUMP_PGO=1 # Will be removed + export CN_DUMP_PGO=1 mkdir -p ${temp_dir_} - export CNRT_PGO_OUTPUT_DIR=${temp_dir_}/output + export CNRT_PGO_OUTPUT_DIR=${temp_dir_}/output # Will be removed + export CN_PGO_OUTPUT_DIR=${temp_dir_}/output export LLVM_PROFILE_FILE=${temp_dir_}/output/host.profraw ${test_cmd_} if [[ ! 
-z ${extra_test_dir_} ]]; then diff --git a/tools/pre-commit b/tools/pre-commit index 41aebdbe8..1d0a1afe1 100755 --- a/tools/pre-commit +++ b/tools/pre-commit @@ -69,7 +69,7 @@ for i in ${filenames} ; do #not_in_tools=$(echo $i | sed -r "s/^tools.*//") if [[ -n "${include_mlu}" ]] && [[ -n "${check_dir}" ]]; then printf_log=$(sed -n -e '/\/=' -e \ - '/\<__bang_printf\>/=' -e '/\/=' -e '/assert(/=' $i) + '/\<__bang_printf\>/=' -e '/\/=' -e '/\bassert(/=' $i) for line in ${printf_log}; do echo $i +${line}