diff --git a/.github/workflows/bangcops_release_test.yaml b/.github/workflows/bangc_all_system_ci.yaml similarity index 60% rename from .github/workflows/bangcops_release_test.yaml rename to .github/workflows/bangc_all_system_ci.yaml index c84b5194e..cfa599c31 100644 --- a/.github/workflows/bangcops_release_test.yaml +++ b/.github/workflows/bangc_all_system_ci.yaml @@ -1,44 +1,54 @@ -name: bangcops_release_test +name: bangc_all_system_test on: push: + branches: [master, r*] + paths: + - 'bangc-ops/kernels/kernel_wrapper/**' + - 'bangc-ops/CMakeLists.txt' + - 'bangc-ops/independent_build.sh' tags: - v* + pull_request: + paths: + - '.github/workflows/bangc_all_system_ci.yaml' jobs: test: strategy: matrix: runner: [mlu270-x5k, mlu290-m5, mlu370-m8] - os: [ubuntu18.04, ubuntu20.04, debian9, debian10, centos7, centos8] + mlu_ops_version : [v0.4.2] + cntoolkit_version : [cntoolkit3.0.2] + os: [ubuntu18.04, ubuntu20.04, debian10, centos7, centos8] runs-on: ${{matrix.runner}} steps: - uses: actions/checkout@v3 - name: pull_images run: | - docker pull docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} + docker pull docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} - name: build_bangc_ops run: > - docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} + docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} ./build.sh --sub_module=bangc - name: mlu_ops_version_check run: > - docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} - bash version_check.sh 0.4.1 + docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} + bash version_check.sh 0.4.2 - name: bangc_ops_release_test_cases run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_test/default_platform - name: bangc_ops_release_temp_cases run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/default_platform @@ -46,17 +56,16 @@ jobs: if: matrix.runner == 'mlu370-m8' run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_test/370 - name: bangc_ops_release_temp_370_cases if: matrix.runner == 'mlu370-m8' run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-${{matrix.os}} + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-${{matrix.os}}-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/370 - - name: clean run: | rm -rf bangc-ops/build diff --git a/.github/workflows/ci.yaml b/.github/workflows/bangc_ci.yaml similarity index 50% rename from .github/workflows/ci.yaml rename to .github/workflows/bangc_ci.yaml index a7cbaf555..8be04e764 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/bangc_ci.yaml @@ -1,16 +1,50 @@ -name: ci +name: bangc_ci on: push: branches: [master, r*] + paths-ignore: + - 'docs/**' + - 'bangpy-ops/**' + - 'docker/**' + - 'samples/**' + - 'installer/**' + - '.github/ISSUE_TEMPLATE/**' + - '.github/pull_request_template.md' + - 'CONTRIBUTION.md' + - 'CPPLINT.cfg' + - 'LICENSE' + - 'README.md' + - 'bangc-ops/README.md' + - 'requirements.txt' + - '.github/workflows/bangc_all_system_ci.yaml' + - '.github/workflows/daily.yaml' pull_request: branches: [master, r*] + paths-ignore: + - 'docs/**' + - 'bangpy-ops/**' + - 'docker/**' + - 'samples/**' + - 'installer/**' + - '.github/ISSUE_TEMPLATE/**' + - '.github/pull_request_template.md' + - 'CONTRIBUTION.md' + - 'CPPLINT.cfg' + - 'LICENSE' + - 'README.md' + - 'bangc-ops/README.md' + - 'requirements.txt' + - '.github/workflows/bangc_all_system_ci.yaml' + - '.github/workflows/daily.yaml' jobs: test: strategy: matrix: runner: [mlu270-x5k, mlu290-m5, mlu370-m8] + mlu_ops_version : [v0.4.2] + cntoolkit_version : [cntoolkit3.0.2] runs-on: ${{matrix.runner}} steps: - uses: actions/checkout@v3 @@ -21,20 +55,20 @@ jobs: - name: build_bangc_ops run: > - docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./build.sh --sub_module=bangc - name: bangc_ops_release_temp_cases run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/default_platform - name: test_bangc_ops_release_temp_370_cases if: matrix.runner == 'mlu370-m8' run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/370 - name: clean diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index eda4a22d3..85b0a31c2 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -3,12 +3,17 @@ name: daily on: schedule: - cron: '0 15 * * *' + pull_request: + paths: + - '.github/workflows/daily.yaml' jobs: test: strategy: matrix: runner: [mlu270-x5k, mlu290-m5, mlu370-m8] + mlu_ops_version : [v0.4.2] + cntoolkit_version : [cntoolkit3.0.2] runs-on: ${{matrix.runner}} steps: - uses: actions/checkout@v3 @@ -19,33 +24,33 @@ jobs: - name: build_bangc_ops run: > - docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + docker run --rm -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./build.sh --sub_module=bangc - name: bangc_ops_release_test_cases run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_test/default_platform - name: bangc_ops_release_temp_cases run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/default_platform - name: test_bangc_ops_release_test_370_cases if: matrix.runner == 'mlu370-m8' run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_test/370 - name: test_bangc_ops_release_temp_370_cases if: matrix.runner == 'mlu370-m8' run: > docker run --rm --device /dev/cambricon_ctl --device /dev/cambricon_dev0 --device /dev/commu0 - -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:v0.4.0-devel-x86_64-ubuntu18.04 + -v /testdata:/testdata -v $(pwd):/work -w /work docker-user.gotgo.cc:30080/mlu-ops/mluops_ci:${{matrix.mlu_ops_version}}-devel-x86_64-ubuntu18.04-${{matrix.cntoolkit_version}} ./test.sh --sub_module=bangc --cases_dir=/testdata/release_temp/370 - name: clean diff --git a/.gitignore b/.gitignore index 98acb07ee..407499c51 100644 --- a/.gitignore +++ b/.gitignore @@ -208,3 +208,6 @@ cython_debug/ dep_libs_extract/ package/ test_workspace/ +daily.software.cambricon.com/ +dependency.txt +bangc-ops/symbol_visibility.map diff --git a/bangc-ops/kernels/copy/aarch64/copy_union1.mlu.o b/bangc-ops/kernels/copy/aarch64/copy_union1.mlu.o deleted file mode 100644 index 9aa61abf1..000000000 Binary files a/bangc-ops/kernels/copy/aarch64/copy_union1.mlu.o and /dev/null differ diff --git a/bangc-ops/kernels/copy/aarch64/copy_with_stride_union1.mlu.o b/bangc-ops/kernels/copy/aarch64/copy_with_stride_union1.mlu.o deleted file mode 100644 index fd10344d1..000000000 Binary files a/bangc-ops/kernels/copy/aarch64/copy_with_stride_union1.mlu.o and /dev/null differ diff --git a/bangc-ops/kernels/copy/x86_64/copy_union1.mlu.o b/bangc-ops/kernels/copy/x86_64/copy_union1.mlu.o index 4edc81aff..d34a6a8f4 100644 Binary files a/bangc-ops/kernels/copy/x86_64/copy_union1.mlu.o and b/bangc-ops/kernels/copy/x86_64/copy_union1.mlu.o differ diff --git a/bangc-ops/kernels/copy/x86_64/copy_with_stride_union1.mlu.o b/bangc-ops/kernels/copy/x86_64/copy_with_stride_union1.mlu.o index 0e0af2457..daf2a66e8 100644 Binary files a/bangc-ops/kernels/copy/x86_64/copy_with_stride_union1.mlu.o and b/bangc-ops/kernels/copy/x86_64/copy_with_stride_union1.mlu.o differ diff --git a/bangc-ops/kernels/expand/aarch64/expand_one_dim_union1.mlu.o b/bangc-ops/kernels/expand/aarch64/expand_one_dim_union1.mlu.o deleted file mode 100644 index 9d75d4fa9..000000000 Binary files a/bangc-ops/kernels/expand/aarch64/expand_one_dim_union1.mlu.o and /dev/null differ diff --git a/bangc-ops/kernels/expand/aarch64/expand_union1.mlu.o b/bangc-ops/kernels/expand/aarch64/expand_union1.mlu.o deleted file mode 100644 index 5c7246d17..000000000 Binary files a/bangc-ops/kernels/expand/aarch64/expand_union1.mlu.o and /dev/null differ diff --git a/bangc-ops/kernels/expand/expand.cpp b/bangc-ops/kernels/expand/expand.cpp index 967e19c16..4dfe72320 100644 --- a/bangc-ops/kernels/expand/expand.cpp +++ b/bangc-ops/kernels/expand/expand.cpp @@ -73,18 +73,18 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, PARAM_CHECK("[mluOpExpand]", input != NULL); PARAM_CHECK("[mluOpExpand]", output != NULL); - uint64_t dims_input[MLUOP_DIM_MAX]; - uint64_t dims_output[MLUOP_DIM_MAX]; - uint64_t redims_input[MLUOP_DIM_MAX + 1]; - uint64_t redims_output[MLUOP_DIM_MAX + 1]; + int32_t dims_input[MLUOP_DIM_MAX]; + int32_t dims_output[MLUOP_DIM_MAX]; + int32_t redims_input[MLUOP_DIM_MAX + 1]; + int32_t redims_output[MLUOP_DIM_MAX + 1]; int32_t count_flag = 0; int32_t count_index[MLUOP_DIM_MAX + 1]; - int fix_num = 0; + int32_t fix_num = 0; size_t input_size = input_num; // Reshape dims: A(a, b, c) ---> A(1, 1, 1, 1, 1, a, b, c, 1) - for (int i = 0; i < MLUOP_DIM_MAX; i++) { + for (int32_t i = 0; i < MLUOP_DIM_MAX; ++i) { dims_input[i] = 1; dims_output[i] = 1; redims_input[i] = 1; @@ -93,19 +93,23 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, redims_input[MLUOP_DIM_MAX] = 1; redims_output[MLUOP_DIM_MAX] = 1; - for (int i = 0; i < input_desc->dim; i++) { + for (int32_t i = 0; i < input_desc->dim; ++i) { dims_input[MLUOP_DIM_MAX - i - 1] = input_desc->dims[input_desc->dim - i - 1]; } - for (int i = 0; i < output_desc->dim; i++) { + for (int32_t i = 0; i < output_desc->dim; ++i) { dims_output[MLUOP_DIM_MAX - i - 1] = output_desc->dims[output_desc->dim - i - 1]; } - while (dims_output[MLUOP_DIM_MAX - 1 - fix_num] == 1) { - fix_num++; + for (int i = 0; i < MLUOP_DIM_MAX; ++i) { + if (dims_output[MLUOP_DIM_MAX - 1 - i] != 1) { + break; + } else { + fix_num++; + } } - for (int i = 0; i < MLUOP_DIM_MAX; i++) { + for (int32_t i = 0; i < MLUOP_DIM_MAX; ++i) { if (dims_output[i] % dims_input[i] != 0) { LOG(ERROR) << "[mluOpExpand] In expand dimension, the size of output" << " should be times of the size of input. But now in expand " @@ -117,7 +121,7 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, } // Reshape: dims(1, A, 1, 1, B, 1) change to redims(A, 1, B) - for (int i = MLUOP_DIM_MAX - 1, j = fix_num; i - j >= 0; i--) { + for (int32_t i = MLUOP_DIM_MAX - 1, j = fix_num; i - j >= 0; --i) { redims_input[i] = dims_input[i - j]; redims_output[i] = dims_output[i - j]; while ((i - j) > 0 && dims_input[i - j] == 1 && @@ -129,7 +133,7 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, size_t output_size = output_num; // Count how many dims need to expand. - for (int i = 0; i < MLUOP_DIM_MAX + 1; i++) { + for (int32_t i = 0; i < MLUOP_DIM_MAX + 1; ++i) { count_index[i] = 0; if (redims_input[i] != redims_output[i]) { count_flag += 1; @@ -162,7 +166,7 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, cnrtFunctionType_t k_type; k_type = CNRT_FUNC_TYPE_UNION1; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); + int32_t core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); int32_t union_number = mluop::runtime::getClusterLimitCapability(handle); k_dim.x = core_dim; k_dim.y = union_number; @@ -179,14 +183,14 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, } if (count_flag == 1) { - uint64_t high_num = 1; - uint64_t expand_num = + int32_t high_num = 1; + int32_t expand_num = redims_output[count_index[0]] / redims_input[count_index[0]]; - uint64_t low_num = 1; - for (int i = 0; i < count_index[0]; i++) { + int32_t low_num = 1; + for (int32_t i = 0; i < count_index[0]; ++i) { high_num *= redims_output[i]; } - for (int i = count_index[0] + 1; i < MLUOP_DIM_MAX + 1; i++) { + for (int32_t i = count_index[0] + 1; i < MLUOP_DIM_MAX + 1; ++i) { low_num *= redims_output[i]; } if (redims_input[count_index[0]] != 1) { @@ -196,9 +200,8 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, << k_type / CORE_DIM << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>"; KERNEL_CHECK((mluOpUnion1KernelExpandOneDim( - k_dim, k_type, handle->queue, (void *)input, output, (uint32_t)high_num, - (uint32_t)expand_num, (uint32_t)low_num, - mluOpDataTypeBytes(data_type)))); + k_dim, k_type, handle->queue, (void *)input, (void *)output, high_num, + expand_num, low_num, mluOpDataTypeBytes(data_type)))); } else { INTERNAL_CHECK("mluOpExpand", MLUOP_STATUS_SUCCESS == policyFunc(handle, &k_dim, &k_type)); @@ -206,15 +209,11 @@ mluOpExpand(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, << k_type / CORE_DIM << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>"; KERNEL_CHECK((mluOpUnion1KernelExpandTensor( - k_dim, k_type, handle->queue, (void *)input, output, - (uint32_t)dims_input[0], (uint32_t)dims_input[1], - (uint32_t)dims_input[2], (uint32_t)dims_input[3], - (uint32_t)dims_input[4], (uint32_t)dims_input[5], - (uint32_t)dims_input[6], (uint32_t)dims_input[7], - (uint32_t)dims_output[0], (uint32_t)dims_output[1], - (uint32_t)dims_output[2], (uint32_t)dims_output[3], - (uint32_t)dims_output[4], (uint32_t)dims_output[5], - (uint32_t)dims_output[6], (uint32_t)dims_output[7], + k_dim, k_type, handle->queue, (void *)input, (void *)output, + dims_input[0], dims_input[1], dims_input[2], dims_input[3], + dims_input[4], dims_input[5], dims_input[6], dims_input[7], + dims_output[0], dims_output[1], dims_output[2], dims_output[3], + dims_output[4], dims_output[5], dims_output[6], dims_output[7], mluOpDataTypeBytes(input_desc->dtype)))); } diff --git a/bangc-ops/kernels/expand/expand_mlu.mlu b/bangc-ops/kernels/expand/expand_mlu.mlu index b5e4639f1..2d7e1002e 100644 --- a/bangc-ops/kernels/expand/expand_mlu.mlu +++ b/bangc-ops/kernels/expand/expand_mlu.mlu @@ -23,37 +23,38 @@ #include "kernels/kernel.h" #include "mlu_op_kernel.h" -template __mlu_global__ void MLUUnion1KernelExpandTensor( - void *input, void *output, T input_1, T input_2, T input_3, T input_4, - T input_5, T input_6, T input_7, T input_8, T output_1, T output_2, - T output_3, T output_4, T output_5, T output_6, T output_7, T output_8, - int dtype_size); + void *input, void *output, int32_t input_1, int32_t input_2, + int32_t input_3, int32_t input_4, int32_t input_5, int32_t input_6, + int32_t input_7, int32_t input_8, int32_t output_1, int32_t output_2, + int32_t output_3, int32_t output_4, int32_t output_5, int32_t output_6, + int32_t output_7, int32_t output_8, int dtype_size); -template __mlu_global__ void MLUUnion1KernelExpandOneDim(void *input, void *output, - T high_num, T expand_num, - T low_num, int dtype_size); + int32_t high_num, + int32_t expand_num, + int32_t low_num, + int dtype_size); void MLUOP_WIN_API mluOpUnion1KernelExpandTensor( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *input, void *output, const uint32_t input_1, - const uint32_t input_2, const uint32_t input_3, const uint32_t input_4, - const uint32_t input_5, const uint32_t input_6, const uint32_t input_7, - const uint32_t input_8, const uint32_t output_1, const uint32_t output_2, - const uint32_t output_3, const uint32_t output_4, const uint32_t output_5, - const uint32_t output_6, const uint32_t output_7, const uint32_t output_8, + const void *input, void *output, const int32_t input_1, + const int32_t input_2, const int32_t input_3, const int32_t input_4, + const int32_t input_5, const int32_t input_6, const int32_t input_7, + const int32_t input_8, const int32_t output_1, const int32_t output_2, + const int32_t output_3, const int32_t output_4, const int32_t output_5, + const int32_t output_6, const int32_t output_7, const int32_t output_8, const int dtype_size) { - MLUUnion1KernelExpandTensor<<>>( - (void *)input, (void *)output, input_1, input_2, input_3, input_4, - input_5, input_6, input_7, input_8, output_1, output_2, output_3, - output_4, output_5, output_6, output_7, output_8, dtype_size); + MLUUnion1KernelExpandTensor<<>>( + (void *)input, output, input_1, input_2, input_3, input_4, input_5, + input_6, input_7, input_8, output_1, output_2, output_3, output_4, + output_5, output_6, output_7, output_8, dtype_size); } void MLUOP_WIN_API mluOpUnion1KernelExpandOneDim( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *input, void *output, const uint32_t high_num, - const uint32_t expand_num, const uint32_t low_num, const int dtype_size) { - MLUUnion1KernelExpandOneDim<<>>( - (void *)input, (void *)output, high_num, expand_num, low_num, dtype_size); + const void *input, void *output, const int32_t high_num, + const int32_t expand_num, const int32_t low_num, const int dtype_size) { + MLUUnion1KernelExpandOneDim<<>>( + (void *)input, output, high_num, expand_num, low_num, dtype_size); } diff --git a/bangc-ops/kernels/expand/x86_64/expand_one_dim_union1.mlu.o b/bangc-ops/kernels/expand/x86_64/expand_one_dim_union1.mlu.o index d87145ce9..33948f5e5 100644 Binary files a/bangc-ops/kernels/expand/x86_64/expand_one_dim_union1.mlu.o and b/bangc-ops/kernels/expand/x86_64/expand_one_dim_union1.mlu.o differ diff --git a/bangc-ops/kernels/expand/x86_64/expand_union1.mlu.o b/bangc-ops/kernels/expand/x86_64/expand_union1.mlu.o index e10ce2614..07774ab51 100644 Binary files a/bangc-ops/kernels/expand/x86_64/expand_union1.mlu.o and b/bangc-ops/kernels/expand/x86_64/expand_union1.mlu.o differ diff --git a/bangc-ops/kernels/fill/aarch64/fill_union1.mlu.o b/bangc-ops/kernels/fill/aarch64/fill_union1.mlu.o deleted file mode 100644 index 3fccea08e..000000000 Binary files a/bangc-ops/kernels/fill/aarch64/fill_union1.mlu.o and /dev/null differ diff --git a/bangc-ops/kernels/fill/aarch64/fill_with_stride_union1.mlu.o b/bangc-ops/kernels/fill/aarch64/fill_with_stride_union1.mlu.o deleted file mode 100644 index 37c424f1c..000000000 Binary files a/bangc-ops/kernels/fill/aarch64/fill_with_stride_union1.mlu.o and /dev/null differ diff --git a/bangc-ops/kernels/fill/fill_mlu.mlu b/bangc-ops/kernels/fill/fill_mlu.mlu index 49b46a6de..77e50f721 100644 --- a/bangc-ops/kernels/fill/fill_mlu.mlu +++ b/bangc-ops/kernels/fill/fill_mlu.mlu @@ -22,228 +22,124 @@ *************************************************************************/ #include "fill_mlu.h" +/****************************************************************************** + * Cambricon CNNL Data Type + ******************************************************************************/ +/* Enumeration variables describing the data types in Cambricon CNNL. */ +typedef enum { + CNNL_DTYPE_INVALID = 0, /*!< The data is an invalid data type. */ + CNNL_DTYPE_HALF = 1, /*!< The data is a 16-bit floating-point data type. */ + CNNL_DTYPE_FLOAT = 2, /*!< The data is a 32-bit floating-point data type. */ + CNNL_DTYPE_DOUBLE = 14, /*!< The data is a 64-bit floating-point data type. */ + CNNL_DTYPE_INT8 = 3, /*!< The data is a 8-bit signed integer data type. */ + CNNL_DTYPE_INT16 = 4, /*!< The data is a 16-bit signed integer data type. */ + CNNL_DTYPE_INT31 = 5, /*!< The data is a 31-bit signed integer data type. */ + CNNL_DTYPE_INT32 = 6, /*!< The data is a 32-bit signed integer data type. */ + CNNL_DTYPE_INT64 = 9, /*!< The data is a 64-bit signed integer data type. */ + CNNL_DTYPE_UINT8 = 7, /*!< The data is a 8-bit unsigned integer data type. */ + CNNL_DTYPE_UINT16 = + 13, /*!< The data is a 16-bit unsigned integer data type. */ + CNNL_DTYPE_UINT32 = + 11, /*!< The data is a 32-bit unsigned integer data type. */ + CNNL_DTYPE_UINT64 = + 12, /*!< The data is a 64-bit unsigned integer data type. */ + CNNL_DTYPE_BOOL = 8, /*!< The data is a boolean data type. */ + CNNL_DTYPE_COMPLEX_HALF = + 15, /*!< The data is a 32-bit complex number of two fp16. */ + CNNL_DTYPE_COMPLEX_FLOAT = + 16, /*!< The data is a 64-bit complex number of two fp32. */ +} cnnlDataType_t; + +// convert mluOpDataType_t to cnnlDataType_t +static inline cnnlDataType_t convertDatatype(mluOpDataType_t k_datatype) { + switch (k_datatype) { + case MLUOP_DTYPE_INVALID: + return CNNL_DTYPE_INVALID; + case MLUOP_DTYPE_HALF: + return CNNL_DTYPE_HALF; + case MLUOP_DTYPE_FLOAT: + return CNNL_DTYPE_FLOAT; + case MLUOP_DTYPE_DOUBLE: + return CNNL_DTYPE_DOUBLE; + case MLUOP_DTYPE_INT8: + return CNNL_DTYPE_INT8; + case MLUOP_DTYPE_INT16: + return CNNL_DTYPE_INT16; + case MLUOP_DTYPE_INT32: + return CNNL_DTYPE_INT32; + case MLUOP_DTYPE_INT64: + return CNNL_DTYPE_INT64; + case MLUOP_DTYPE_UINT8: + return CNNL_DTYPE_UINT8; + case MLUOP_DTYPE_UINT16: + return CNNL_DTYPE_UINT16; + case MLUOP_DTYPE_UINT32: + return CNNL_DTYPE_UINT32; + case MLUOP_DTYPE_UINT64: + return CNNL_DTYPE_UINT64; + case MLUOP_DTYPE_BOOL: + return CNNL_DTYPE_BOOL; + case MLUOP_DTYPE_COMPLEX_HALF: + return CNNL_DTYPE_COMPLEX_HALF; + case MLUOP_DTYPE_COMPLEX_FLOAT: + return CNNL_DTYPE_COMPLEX_FLOAT; + } +} + // FillDeviceValue -template __mlu_global__ void MLUUnion1KernelFillDeviceValue(void *output, size_t size, - const void *value); -__mlu_global__ void MLUUnion1KernelFillDeviceValueBool(void *output, - size_t size, - const void *value); -template -__mlu_global__ void MLUUnion1KernelFillDeviceValueInt64(void *output, - size_t size, - const void *value); + const void *value, + cnnlDataType_t k_datatype); + void MLUOP_WIN_API mluOpUnion1KernelFillDeviceValue( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t k_datatype, void *output, size_t size, const void *value) { - switch (k_datatype) { - case MLUOP_DTYPE_BOOL: { - MLUUnion1KernelFillDeviceValueBool<<>>(output, size, - value); - }; break; - case MLUOP_DTYPE_INT8: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_UINT8: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_INT16: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_UINT16: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_HALF: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_FLOAT: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_INT32: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_UINT32: { - MLUUnion1KernelFillDeviceValue - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_INT64: { - MLUUnion1KernelFillDeviceValueInt64 - <<>>(output, size, value); - }; break; - case MLUOP_DTYPE_UINT64: { - MLUUnion1KernelFillDeviceValueInt64 - <<>>(output, size, value); - }; break; - default: { - LOG(ERROR) << "mluOpFill with " << k_datatype - << " data type is NOT implemented currently."; - }; break; - } + cnnlDataType_t cnnl_k_datatype = convertDatatype(k_datatype); + MLUUnion1KernelFillDeviceValue<<>>(output, size, value, + cnnl_k_datatype); } // FillHostValue -template __mlu_global__ void MLUUnion1KernelFillHostValue(void *output, size_t size, - uint32_t value); -__mlu_global__ void MLUUnion1KernelFillHostValueInt64(void *output, size_t size, - uint32_t value, - uint32_t value_high, - uint32_t value_low); + uint32_t value, + uint32_t value_high, + uint32_t value_low, + cnnlDataType_t k_datatype); + void MLUOP_WIN_API mluOpUnion1KernelFillHostValue( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t k_datatype, void *output, size_t size, uint32_t value, uint32_t value_high, uint32_t value_low) { - uint32_t fill_value = false; - switch (k_datatype) { - case MLUOP_DTYPE_BOOL: - fill_value = (bool)((uint8_t)value); - MLUUnion1KernelFillHostValue - <<>>(output, size, fill_value); - case MLUOP_DTYPE_INT8: - case MLUOP_DTYPE_UINT8: { - MLUUnion1KernelFillHostValue - <<>>(output, size, (uint32_t)value); - }; break; - case MLUOP_DTYPE_HALF: - case MLUOP_DTYPE_INT16: - case MLUOP_DTYPE_UINT16: { - MLUUnion1KernelFillHostValue - <<>>(output, size, (uint32_t)value); - }; break; - case MLUOP_DTYPE_FLOAT: - case MLUOP_DTYPE_INT32: - case MLUOP_DTYPE_UINT32: { - MLUUnion1KernelFillHostValue - <<>>(output, size, (uint32_t)value); - }; break; - case MLUOP_DTYPE_INT64: - case MLUOP_DTYPE_UINT64: { - MLUUnion1KernelFillHostValueInt64<<>>( - output, size, (uint32_t)value, value_high, value_low); - }; break; - default: { - LOG(ERROR) << "mluOpFill with " << k_datatype - << " data type is NOT implemented currently."; - }; break; - } + cnnlDataType_t cnnl_k_datatype = convertDatatype(k_datatype); + MLUUnion1KernelFillHostValue<<>>( + output, size, value, value_high, value_low, cnnl_k_datatype); } // FillDeviceValueWithStride -template __mlu_global__ void MLUUnion1KernelFillDeviceValueWithStride( - void *output, TensorShape output_shape, size_t size, const void *value); - -__mlu_global__ void MLUUnion1KernelFillDeviceValueWithStrideBool( - void *output, TensorShape output_shape, size_t size, const void *value); - -template -__mlu_global__ void MLUUnion1KernelFillDeviceValueWithStrideInt64( - void *output, TensorShape output_shape, size_t size, const void *value); + void *output, TensorShape output_shape, size_t size, const void *value, + cnnlDataType_t k_datatype); void MLUOP_WIN_API mluOpUnion1KernelFillDeviceValueWithStride( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t k_datatype, void *output, TensorShape output_shape, size_t size, const void *value) { - switch (k_datatype) { - case MLUOP_DTYPE_BOOL: { - MLUUnion1KernelFillDeviceValueWithStrideBool<<>>( - output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_INT8: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_UINT8: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_INT16: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_UINT16: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_HALF: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_FLOAT: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_INT32: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_UINT32: { - MLUUnion1KernelFillDeviceValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_INT64: { - MLUUnion1KernelFillDeviceValueWithStrideInt64 - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_UINT64: { - MLUUnion1KernelFillDeviceValueWithStrideInt64 - <<>>(output, output_shape, size, value); - }; break; - default: { - LOG(ERROR) << "mluOpFill with " << k_datatype - << " data type is NOT implemented currently."; - }; break; - } + cnnlDataType_t cnnl_k_datatype = convertDatatype(k_datatype); + MLUUnion1KernelFillDeviceValueWithStride<<>>( + output, output_shape, size, value, cnnl_k_datatype); } // FillHostValueWithStride -template __mlu_global__ void MLUUnion1KernelFillHostValueWithStride( - void *output, TensorShape output_shape, size_t size, uint32_t value); - -__mlu_global__ void MLUUnion1KernelFillHostValueWithStrideUint64( void *output, TensorShape output_shape, size_t size, uint32_t value, - uint32_t value_high, uint32_t value_low); + uint32_t value_high, uint32_t value_low, cnnlDataType_t k_datatype); void MLUOP_WIN_API mluOpUnion1KernelFillHostValueWithStride( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, mluOpDataType_t k_datatype, void *output, TensorShape output_shape, size_t size, uint32_t value, uint32_t value_high, uint32_t value_low) { - switch (k_datatype) { - case MLUOP_DTYPE_BOOL: - value = (bool)((uint8_t)value); - case MLUOP_DTYPE_INT8: - case MLUOP_DTYPE_UINT8: { - MLUUnion1KernelFillHostValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_HALF: - case MLUOP_DTYPE_INT16: - case MLUOP_DTYPE_UINT16: { - MLUUnion1KernelFillHostValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_FLOAT: - case MLUOP_DTYPE_INT32: - case MLUOP_DTYPE_UINT32: { - MLUUnion1KernelFillHostValueWithStride - <<>>(output, output_shape, size, value); - }; break; - case MLUOP_DTYPE_INT64: - case MLUOP_DTYPE_UINT64: { - MLUUnion1KernelFillHostValueWithStrideUint64<<>>( - output, output_shape, size, value, value_high, value_low); - }; break; - default: { - LOG(ERROR) << "mluOpFill with " << k_datatype - << " data type is NOT implemented currently."; - }; break; - } + cnnlDataType_t cnnl_k_datatype = convertDatatype(k_datatype); + MLUUnion1KernelFillHostValueWithStride<<>>( + output, output_shape, size, value, value_high, value_low, + cnnl_k_datatype); } diff --git a/bangc-ops/kernels/fill/x86_64/fill_union1.mlu.o b/bangc-ops/kernels/fill/x86_64/fill_union1.mlu.o index 9c947aa41..590d1f787 100644 Binary files a/bangc-ops/kernels/fill/x86_64/fill_union1.mlu.o and b/bangc-ops/kernels/fill/x86_64/fill_union1.mlu.o differ diff --git a/bangc-ops/kernels/fill/x86_64/fill_with_stride_union1.mlu.o b/bangc-ops/kernels/fill/x86_64/fill_with_stride_union1.mlu.o index 8eb632183..3e93b1b25 100644 Binary files a/bangc-ops/kernels/fill/x86_64/fill_with_stride_union1.mlu.o and b/bangc-ops/kernels/fill/x86_64/fill_with_stride_union1.mlu.o differ diff --git a/bangc-ops/mlu_op_kernel.h b/bangc-ops/mlu_op_kernel.h index 071fa5748..fae56441e 100644 --- a/bangc-ops/mlu_op_kernel.h +++ b/bangc-ops/mlu_op_kernel.h @@ -235,18 +235,18 @@ void MLUOP_WIN_API mluOpUnionKernelThreeInterpolateForwardHalf( /* Expand */ void MLUOP_WIN_API mluOpUnion1KernelExpandTensor( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *input, void *output, const uint32_t input_1, - const uint32_t input_2, const uint32_t input_3, const uint32_t input_4, - const uint32_t input_5, const uint32_t input_6, const uint32_t input_7, - const uint32_t input_8, const uint32_t output_1, const uint32_t output_2, - const uint32_t output_3, const uint32_t output_4, const uint32_t output_5, - const uint32_t output_6, const uint32_t output_7, const uint32_t output_8, + const void *input, void *output, const int32_t input_1, + const int32_t input_2, const int32_t input_3, const int32_t input_4, + const int32_t input_5, const int32_t input_6, const int32_t input_7, + const int32_t input_8, const int32_t output_1, const int32_t output_2, + const int32_t output_3, const int32_t output_4, const int32_t output_5, + const int32_t output_6, const int32_t output_7, const int32_t output_8, const int dtype_size); void MLUOP_WIN_API mluOpUnion1KernelExpandOneDim( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *input, void *output, const uint32_t high_num, - const uint32_t expand_num, const uint32_t low_num, const int dtype_size); + const void *input, void *output, const int32_t high_num, + const int32_t expand_num, const int32_t low_num, const int dtype_size); /* Psamask */ typedef enum { diff --git a/docker/ubuntu18.04-x86_64/Dockerfile b/docker/ubuntu18.04-x86_64/Dockerfile index f8d5a8714..2d19ad931 100644 --- a/docker/ubuntu18.04-x86_64/Dockerfile +++ b/docker/ubuntu18.04-x86_64/Dockerfile @@ -13,7 +13,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get clean && rm -rf /var/lib/apt/lists/* -ARG CNTOOLKIT_VERSION="3.2.0-1" +ARG CNTOOLKIT_VERSION="3.0.2-1" # NOTE: the url below is only for internal usage, you could replace with your own cntoolkit package. ADD http://daily.software.cambricon.com/release/cntoolkit/Linux/x86_64/Ubuntu/18.04/${CNTOOLKIT_VERSION}/cntoolkit_${CNTOOLKIT_VERSION}.ubuntu18.04_amd64.deb /tmp/