diff --git a/core/cnlog.cpp b/core/cnlog.cpp index b282d367e..20b1bcd71 100644 --- a/core/cnlog.cpp +++ b/core/cnlog.cpp @@ -423,9 +423,9 @@ void LogMessage::printHead(bool is_colored) { fmt::arg("module", is_colored ? fmt::to_string(fmt::styled( - "MLUOP", fmt::emphasis::bold | + "MLU-OPS", fmt::emphasis::bold | fmt::fg(fmt::terminal_color::yellow))) - : "MLUOP"), + : "MLU-OPS"), fmt::arg("severity", formatSeverity(logSeverity_, is_colored)), fmt::arg("pid", getpid_()), fmt::arg("card", []() { int dev_index = -1; diff --git a/core/context.cpp b/core/context.cpp index 6c8ec406f..dc1cea168 100644 --- a/core/context.cpp +++ b/core/context.cpp @@ -30,7 +30,7 @@ #include "kernels/kernel.h" #define DEP_CHECK_LOG(level) \ - mluop::logging::LogMessage(__FILE__, __LINE__, 4, level, "MLUOP", true, \ + mluop::logging::LogMessage(__FILE__, __LINE__, 4, level, "MLU-OPS", true, \ true, true, true) \ .stream() diff --git a/core/gen_case.h b/core/gen_case.h index e815c29ff..7e60c9de7 100644 --- a/core/gen_case.h +++ b/core/gen_case.h @@ -39,8 +39,8 @@ #define MLUOP_GEN_CASE_ON_NEW (mluop::gen_case::genCaseModeGet(true) > 0) #define GEN_CASE_START(op_name, op_type) \ - mluop::gen_case::PbNode *node = mluop::gen_case::genCaseStart(op_name, \ - op_type) + mluop::gen_case::PbNode *node = \ + mluop::gen_case::genCaseStart(op_name, op_type) #define GEN_CASE_DATA(is_input, id, data, data_desc, upper_bound, lower_bound) \ mluop::gen_case::genCaseData(node, is_input, id, data, data_desc, \ @@ -252,7 +252,10 @@ class PbNode { op_param.name = param_node_name; if (dtype == MLUOP_DTYPE_HALF) { op_param.params.push_back( - {param_name, std::to_string(castHalfToFloat32(param_value))}); + {param_name, get_float_string_of_half_or_bf16(¶m_value, dtype)}); + } else if (dtype == MLUOP_DTYPE_BFLOAT16) { + op_param.params.push_back( + {param_name, get_float_string_of_half_or_bf16(¶m_value, dtype)}); } else if (std::is_same::value || std::is_same::value) { op_param.params.push_back( @@ -360,11 +363,28 @@ class PbNode { return " value_i: "; } } + + inline std::string get_float_string_of_half_or_bf16(void *data, + mluOpDataType_t dtype) { + char buffer[128]; + float dst = 0.0; + if (MLUOP_DTYPE_HALF == dtype) { + cnrtCastDataType_V2(data, cnrtHalf, &dst, cnrtFloat, 1, nullptr, + cnrtRounding_rm); + } else if (MLUOP_DTYPE_BFLOAT16 == dtype) { + cnrtCastDataType_V2(data, cnrtBfloat, &dst, cnrtFloat, 1, nullptr, + cnrtRounding_rm); + } + std::snprintf(buffer, sizeof(buffer), "%.9g", dst); + return std::string(buffer); + } + inline std::string get_data_string(mluOpDataType_t dtype, void *data, uint64_t offset) { switch (dtype) { case MLUOP_DTYPE_HALF: - return std::to_string(castHalfToFloat32(((int16_t *)data)[offset])); + return get_float_string_of_half_or_bf16(((int16_t *)data) + offset, + dtype); case MLUOP_DTYPE_BFLOAT16: return std::to_string(((uint16_t *)data)[offset]); case MLUOP_DTYPE_FLOAT: @@ -372,7 +392,8 @@ class PbNode { case MLUOP_DTYPE_DOUBLE: return std::to_string(((double *)data)[offset]); case MLUOP_DTYPE_COMPLEX_HALF: - return std::to_string(castHalfToFloat32(((int16_t *)data)[offset])); + return get_float_string_of_half_or_bf16(((int16_t *)data) + offset, + dtype); case MLUOP_DTYPE_COMPLEX_FLOAT: return std::to_string(((float *)data)[offset]); case MLUOP_DTYPE_INT8: @@ -474,9 +495,9 @@ class PbNode { (tensors[index].desc->pointer_mode == MLUOP_POINTER_MODE_HOST ? 
cnrtMemcpyHostToHost : cnrtMemcpyDevToHost); - if (cnrtSuccess == - cnrtMemcpy(data, const_cast(tensors[index].device_ptr), - data_size, memcpy_dir)) { + if (cnrtSuccess == cnrtMemcpy(data, + const_cast(tensors[index].device_ptr), + data_size, memcpy_dir)) { return data; } else { LOG(ERROR) << "[gen_case] Dump data failed! cnrtMemcpy data size is " @@ -539,8 +560,7 @@ inline void PbNode::appendOpParam(std::string param_name, if (attr.type == cnrtMemTypeDevice) { void *data = malloc(data_width); if (cnrtSuccess == cnrtMemcpy(data, const_cast(param_value), - data_width, - cnrtMemcpyDevToHost)) { + data_width, cnrtMemcpyDevToHost)) { op_param.params.push_back({param_name, get_data_string(dtype, data, 0)}); } else { LOG(ERROR) << "[gen_case] dump op param failed, param_name is " diff --git a/docs/MLU-OPS-Compile-Develop-And-Test.md b/docs/MLU-OPS-Compile-Develop-And-Test.md index 9939ba04e..b526c8531 100644 --- a/docs/MLU-OPS-Compile-Develop-And-Test.md +++ b/docs/MLU-OPS-Compile-Develop-And-Test.md @@ -1,17 +1,20 @@ # MLU-OPS™ 算子编译、开发与测试 ## 编译 Operators + - 环境依赖准备 环境准备参看“依赖环境准备”章节。 - 在mlu-ops目录下,可以使用以下命令完成环境变量的设置。 + ```sh cd mlu-ops mlu-ops$ source env.sh ``` - 编译所有算子 + ```sh cd mlu-ops mlu-ops$ ./build.sh @@ -38,11 +41,11 @@ 当算子存在正反向,且在kernel下的同一个文件夹下实现时 - 文件结构 - + `mlu-ops/kernels/op_name`、`mlu-ops/test/mlu_op_gtest/pb_gtest/src/zoo/op_name_forward(op_name_backward)`、`mlu-ops/test/mlu_op_gtest/api_gtest/src/gtest/op_name_forward(op_name_backward)` - 添加依赖 - + 在[kernel_depends.toml](../kernel_depends.toml)文件中的[gtest]下添加依赖说明 ```sh @@ -63,11 +66,11 @@ - 编译指定MLU板卡 - ```sh - mlu-ops$ ./build.sh # 编译多架构的版本,libmluops.so 体积较大,cncc使用多arch的cnfatbin封装 - mlu-ops$ ./build.sh --mlu370 # 编译 MLU370 板卡专用版本,cncc使用选项--bang-mlu-arch=mtp_372 - mlu-ops$ ./build.sh --mlu370 --filter="abs;div" # mlu370 下编译 abs 算子和 div 算子 - ``` + ```sh + mlu-ops$ ./build.sh # 编译多架构的版本,libmluops.so 体积较大,cncc使用多arch的cnfatbin封装 + mlu-ops$ ./build.sh --mlu370 # 编译 MLU370 板卡专用版本,cncc使用选项--bang-mlu-arch=mtp_372 + mlu-ops$ ./build.sh --mlu370 --filter="abs;div" # mlu370 下编译 abs 算子和 div 算子 + ``` - kernel_depends.toml @@ -80,32 +83,35 @@ - gen_symbol_visibility_map.py - `gen_symbol_visibility_map.py`脚本用于解析`mlu_op.h`头文件,获取函数名,生成`symbol_visibility.map`配置文件。 + ```sh MLUOP_ABI { - global: op1_func;op2_func; - local: *; + global: op1_func;op2_func; + local: *; }; ``` + global:表示符号是全局的(外部的) local:表示符号是本地的,即对外不可见 + - 执行build.sh编译时,将自动执行`gen_symbol_visibility_map.py`生成`symbol_visibility.map`配置文件。 + - 在编译阶段依据`symbol_visibility.map`文件中global字段定义的符号表,将动态库`libmluops.so`中除global中定义的符号外其他符号定义为local。 - 命令行参数 可通过`./build.sh -h`或`./build.sh --help`,查看命令行参数 - | 变量名 | 默认值 | 说明 | 关联cmake选项 | 关联命令行参数 | - | --------------------------- | ---------------------------------- | ------------------------------------------------------ | --------------------------- | ------------------------------------ | - | `BUILD_MODE` | release | release/debug,编译模式 | `CMAKE_BUILD_TYPE` | -d
--debug | - | `NEUWARE_HOME` | 用户声明,或`source env.sh`设置 | neuware路径,包含cnrt,cndrv | `NEUWARE_HOME` | | - | `MLUOP_BUILD_COVERAGE_TEST` | OFF | 代码覆盖率测试 | `MLUOP_BUILD_COVERAGE_TEST` | -c
--coverage | - | `MLUOP_BUILD_ASAN_CHECK` | OFF | 开启ASAN内存检查工具 | `MLUOP_BUILD_ASAN_CHECK` | --asan | - | `MLUOP_MLU_ARCH_LIST` | `mtp_372` | 目标mlu架构列表,分号分割的字符串,如"mtp_372" | `MLUOP_MLU_ARCH_LIST` | --mlu370 | - | `MLUOP_BUILD_SPECIFIC_OP` | 空 | 编译指定的算子 | `MLUOP_BUILD_SPECIFIC_OP` | --filter | - | `BUILD_JOBS` | 16 | 编译指定的线程数 | `BUILD_JOBS` | -j
--jobs | - - + | 变量名 | 默认值 | 说明 | 关联cmake选项 | 关联命令行参数 | + | --------------------------- | ------------------------------- | ------------------------------------------------------------ | --------------------------- | ------------------ | + | `BUILD_MODE` | release | release/debug,编译模式 | `CMAKE_BUILD_TYPE` | -d
--debug | + | `NEUWARE_HOME` | 用户声明,或`source env.sh`设置 | neuware路径,包含cnrt,cndrv | `NEUWARE_HOME` | | + | `MLUOP_MLU_ARCH_LIST` | `mtp_372` | 目标mlu架构列表,分号分割的字符串,如"mtp_372" | `MLUOP_MLU_ARCH_LIST` | --mlu370 | + | `BUILD_JOBS` | 16 | 编译指定的线程数 | `BUILD_JOBS` | -j
--jobs | + | `MLUOP_BUILD_COVERAGE_TEST` | OFF | 代码覆盖率测试 | `MLUOP_BUILD_COVERAGE_TEST` | -c
--coverage | + | `MLUOP_BUILD_ASAN_CHECK` | OFF | 开启ASAN内存检查工具 | `MLUOP_BUILD_ASAN_CHECK` | --asan | + | `MLUOP_BUILD_PERF` | OFF | 开启性能分析,结合工具[cnperf-cli](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cnperf_5.7.2/)使用分析算子内资源的调度 | `MLUOP_BUILD_PERF` | -p
--perf | + | `MLUOP_BUILD_SPECIFIC_OP` | 空 | 编译指定的算子 | `MLUOP_BUILD_SPECIFIC_OP` | --filter | ## 运行测试用例 @@ -141,18 +147,18 @@ mlu-ops$ source env_gencase_set.sh on mlu-ops$ source env_gencase_set.sh off ``` -| | 环境变量 | 功能说明 | 使用方法 | 备注 | -|---|------------------------|---------------------------------------------------------|----|-----------------------------------------| -| 1 | MLUOP_BUILD_GTEST | 编译MLU-OPS™ 的GTEST| ON时使能,其他情况不使能 | 在build脚本中默认设为ON | -| 2 | MLUOP_GTEST_DUMP_DATA | 将MLU-OPS™ 的GTEST的输入输出数据打印至文件中| ON: 保存 GTEST 测试过程中用到的输入输出数据 | 不使用此环境变量时需要unset环境变量 | -| 3 | MLUOP_GEN_CASE |运行前设置,设置gen_case工具功能等级 |0: 关闭 gen_case 模块功能;
1: 生成 prototxt,输入输出只保留 shape 等信息(GEN_CASE_DATA_REAL 将无效);
2: 生成 prototxt,并保留输入真实值;
3: 不生成 prototxt,只在屏幕上打印输入输出的 shape 等信息;
详情见: [Gencase-User-Guide-zh.md](./Gencase-User-Guide-zh.md)| | -| 4 | MLUOP_MIN_LOG_LEVEL | 设置外部LOG()宏的最小打印级别,用来让外部用户屏蔽不需要的LOG|0: enable INFO/WARNING/ERROR/FATAL;
1: enable WARNING/ERROR/FATAL;
2: enable ERROR/FATAL;
3: enable FATAL |默认为0 | -| 5 | MLUOP_MIN_VLOG_LEVEL |设置内部VLOG()宏的最小打印级别,用来控制软件内部不同层级调试需要的LOG |0: enable VLOG(0);
1: enable VLOG(0)-VLOG(1);
2: enable VLOG(0)-VLOG(2);
3: enable VLOG(0)-VLOG(3);
4: enable VLOG(0)-VLOG(4);
5: enable VLOG(0)-VLOG(5);
6: enable VLOG(0)-VLOG(6);
7: enable VLOG(0)-VLOG(7); | 默认为0| -| 6 | MLUOP_LOG_ONLY_SHOW | 是否之展示LOG 而不生成mluop_auto_log 文件 |=ON时,表示不会生产mluop_auto_log文件;
=OFF时,表示会生成mluop_auto_log文件 | 默认为ON| -| 7 | MLUOP_LOG_COLOR_PRINT | 决定打印LOG是否开启颜色字体特效 |=ON时,表示打印带颜色的字体加粗等特效;
=OFF时,表示关闭打印字体特效 | 默认为ON,但重定向到文件时,不会带颜色字体特效| -| 8 | MLUOP_BUILD_ASAN_CHECK | 在编译的时候设置是否打开ASAN内存检查 |=ON时,表示编译ASAN内存检查;
!=ON时,表示不编译ASAN内存检查 | 1.默认不开启
2.该工具仅在Ubuntu上与Debian上有效。无论环境变量如何设置,Centos上都不会编译该工具。
3.如果没有检测到内存问题,运行算子case时将不会输出任何内容; 若检测到内存问题,运行算子case时将输出错误内容。| -|9|MLUOP_SET_JOB_LIMIT_CAPABILITY|设置最大JOB限制数量,默认不设置。|=1 CN_KERNEL_CLASS_UNION
=2 CN_KERNEL_CLASS_UNION2
=3 CN_KERNEL_CLASS_UNION4
=4 CN_KERNEL_CLASS_UNION8
=5 CN_KERNEL_CLASS_UNION16
=6 CN_KERNEL_CLASS_BLOCK不使用
=7 CN_KERNEL_CLASS_NONE不使用
|JOB_LIMIT和CLUSTER_LIMIT需要同时设置来保证合法性| -|10|MLUOP_GTEST_CLUSTER_LIMIT_CAPABILITY|设置最大cluster限制数量,默认不设置|=1 1cluster
=3 2cluster
=7 3cluster
=15 4cluster
...
从右往左,每多一个连续的1表示1个cluster |JOB_LIMIT 和CLUSTER_LIMIT 需要同时设置来保证合法性
原理是:
1的二进制是0000,0001: 1号cluster可用
3的二进制是0000,0011: 1号和2好cluster可用
...
如果有特殊需求,如只想用2号cluster:设置为2: 0000,0010| -|11|MLUOP_GTEST_SET_GDRAM|作用是在GDRAM前后刷NAN/INF| NAN/INF 在GDRAM前后刷NAN/INF|若不设置则根据日期,偶数天刷NAN,奇数天刷INF| -|12|MLUOP_GTEST_UNALIGNED_ADDRESS_RANDOM|设置在GDRAM上申请的空间地址是非64 bytes对齐的,偏移量为1~63的随机值| ON/OFF || -|13|MLUOP_GTEST_UNALIGNED_ADDRESS_SET|设置在GDRAM上申请的空间地址是64 bytes对齐的| = NUM || +| | 环境变量 | 功能说明 | 使用方法 | 备注 | +| ---- | ------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| 1 | MLUOP_BUILD_GTEST | 编译MLU-OPS™ 的GTEST | ON时使能,其他情况不使能 | 在build脚本中默认设为ON | +| 2 | MLUOP_GTEST_DUMP_DATA | 将MLU-OPS™ 的GTEST的输入输出数据打印至文件中 | ON: 保存 GTEST 测试过程中用到的输入输出数据 | 不使用此环境变量时需要unset环境变量 | +| 3 | MLUOP_GEN_CASE | 运行前设置,设置gen_case工具功能等级 | 0: 关闭 gen_case 模块功能;
1: 生成 prototxt,输入输出只保留 shape 等信息(GEN_CASE_DATA_REAL 将无效);
2: 生成 prototxt,并保留输入真实值;
3: 不生成 prototxt,只在屏幕上打印输入输出的 shape 等信息;
详情见: [Gencase-User-Guide-zh.md](./Gencase-User-Guide-zh.md) | | +| 4 | MLUOP_MIN_LOG_LEVEL | 设置外部LOG()宏的最小打印级别,用来让外部用户屏蔽不需要的LOG | 0: enable INFO/WARNING/ERROR/FATAL;
1: enable WARNING/ERROR/FATAL;
2: enable ERROR/FATAL;
3: enable FATAL | 默认为0 | +| 5 | MLUOP_MIN_VLOG_LEVEL | 设置内部VLOG()宏的最小打印级别,用来控制软件内部不同层级调试需要的LOG | 0: enable VLOG(0);
1: enable VLOG(0)-VLOG(1);
2: enable VLOG(0)-VLOG(2);
3: enable VLOG(0)-VLOG(3);
4: enable VLOG(0)-VLOG(4);
5: enable VLOG(0)-VLOG(5);
6: enable VLOG(0)-VLOG(6);
7: enable VLOG(0)-VLOG(7); | 默认为0 |
| 6 | MLUOP_LOG_ONLY_SHOW | 是否只展示LOG 而不生成mluop_auto_log 文件 | =ON时,表示不会生成mluop_auto_log文件;<br>
=OFF时,表示会生成mluop_auto_log文件 | 默认为ON | +| 7 | MLUOP_LOG_COLOR_PRINT | 决定打印LOG是否开启颜色字体特效 | =ON时,表示打印带颜色的字体加粗等特效;
=OFF时,表示关闭打印字体特效 | 默认为ON,但重定向到文件时,不会带颜色字体特效 | +| 8 | MLUOP_BUILD_ASAN_CHECK | 在编译的时候设置是否打开ASAN内存检查 | =ON时,表示编译ASAN内存检查;
!=ON时,表示不编译ASAN内存检查 | 1.默认不开启
2.该工具仅在Ubuntu与Debian上有效。无论环境变量如何设置,CentOS上都不会编译该工具。<br>
3.如果没有检测到内存问题,运行算子case时将不会输出任何内容; 若检测到内存问题,运行算子case时将输出错误内容。 | +| 9 | MLUOP_SET_JOB_LIMIT_CAPABILITY | 设置最大JOB限制数量,默认不设置。 | =1 CN_KERNEL_CLASS_UNION
=2 CN_KERNEL_CLASS_UNION2
=3 CN_KERNEL_CLASS_UNION4
=4 CN_KERNEL_CLASS_UNION8
=5 CN_KERNEL_CLASS_UNION16
=6 CN_KERNEL_CLASS_BLOCK(不使用)<br>
=7 CN_KERNEL_CLASS_NONE(不使用)<br>
| JOB_LIMIT和CLUSTER_LIMIT需要同时设置来保证合法性 | +| 10 | MLUOP_GTEST_CLUSTER_LIMIT_CAPABILITY | 设置最大cluster限制数量,默认不设置 | =1 1cluster
=3 2cluster
=7 3cluster
=15 4cluster
...
从右往左,每多一个连续的1表示1个cluster | JOB_LIMIT 和CLUSTER_LIMIT 需要同时设置来保证合法性
原理是:
1的二进制是0000,0001: 1号cluster可用
3的二进制是0000,0011: 1号和2号cluster可用<br>
...
如果有特殊需求,如只想用2号cluster:设置为2: 0000,0010 | +| 11 | MLUOP_GTEST_SET_GDRAM | 作用是在GDRAM前后刷NAN/INF | NAN/INF 在GDRAM前后刷NAN/INF | 若不设置则根据日期,偶数天刷NAN,奇数天刷INF | +| 12 | MLUOP_GTEST_UNALIGNED_ADDRESS_RANDOM | 设置在GDRAM上申请的空间地址是非64 bytes对齐的,偏移量为1~63的随机值 | ON/OFF | | +| 13 | MLUOP_GTEST_UNALIGNED_ADDRESS_SET | 设置在GDRAM上申请的空间地址是64 bytes对齐的 | = NUM | | \ No newline at end of file diff --git a/docs/design_docs/logspace/logspace.md b/docs/design_docs/logspace/logspace.md index 3ede40a9d..f9f3d76dc 100644 --- a/docs/design_docs/logspace/logspace.md +++ b/docs/design_docs/logspace/logspace.md @@ -305,7 +305,11 @@ const int32_t remain_steps = cur_core_num % max_deal_num; #### 3.1.3 其他计算逻辑 -考虑到与 cuda 对齐,除上述 2 个主要分支外,还进行如下划分: +考虑到与 cuda nan/inf 对齐,设计了其他分支。 + +cuda powf 计算逻辑:https://docs.nvidia.com/cuda/cuda-math-api/cuda_math_api/group__CUDA__MATH__SINGLE.html#group__cuda__math__single_1gab519b517c0036b3604d602f716a919dd + +除前述 2 个主要分支外,还进行如下划分: `steps`为 0,不调用 kernel ,直接返回。 @@ -313,17 +317,59 @@ kernel 内对分支进行如下划分。 1. `steps`为 1,直接计算 $base^{start}$; -2. `start`与`end`同时为 0 或同时为 inf,或者`base`为 1。结果均为 1 或 nan,填充数值; +2. `start`与`end`同时为 0 ,或至少 1 个为 inf,或者`base`为 1。 -3. `start`与`end`仅有 1 个为 inf,根据 cuda 的输出,结果填充 0,inf,nan 的组合; + ```c++ + else if ((scalar_start == 0 && scalar_end == 0) || base == 1 || + (abs(scalar_start) == INFINITY) || (abs(scalar_end) == INFINITY)) { + dealAllResultsOneOrNanOrHalfSpecial(scalar_start, scalar_end, steps, base, + res); + } + ``` -4. `base`等于0,根据`start`与`end`的正负,在结果中填充 0 和 inf 的组合; + 结果分为四段,根据分段计算结果填充 0,1,inf,nan 的组合。 + + ```c++ + float step = (float)(end - start) / (steps - 1); + float base_start = powf(base, start + step * 0); + float base_first_half = powf(base, start + step * 1); + float base_second_half = powf(base, end - step * 1); + float base_end = powf(base, end - step * 0); + + int64_t halfway = steps / 2; + setResult(res, 1, (T)base_start); + setResult(res + 1, halfway - 1, (T)base_first_half); + setResult(res + halfway, (steps + 1) / 2 - 1, (T)base_second_half); + setResult(res + steps - 1, 1, (T)base_end); + ``` + + 理由如下:Pytorch 中,logspace 算子分为前后两段计算。 -5. 间隔 step 等于 0,或在 half 类型下间隔过小。前一半结果为 $base^{start}$ ,后一半为 $base^{end}$; + index 小于 steps / 2 时,计算 pow(base, start + step * index); + + index 大于等于 steps / 2 时,计算 pow(base, end - step * (steps - index - 1))。 + + 对应上述代码中的 base_first_half 与 base_second_half。 + + 另外,当间隔 step 为 inf 时,对于起始位置有 + + step*index = inf*0 = nan + + 对于结束位置 + + step*(steps - index - 1) = inf*0 = nan + + 因此起始位置与结束位置单独计算,对应上述代码的 base_start 与 base_end。 -6. 负底数分支,见 3.1.2; + 综上,本分支分为 4 段,根据分段计算结果填充 0,1,inf,nan 的组合; -7. 正底数以及底数为 nan 分支,见 3.1.1。 +3. `base`等于0,根据`start`与`end`的正负,在结果中填充 0,inf,nan 的组合; + +4. 间隔 step 等于 0,或在 half 类型下间隔过小。前一半结果为 $base^{start}$ ,后一半为 $base^{end}$; + +5. 负底数分支,见 3.1.2; + +6. 正底数以及底数为 nan 分支,见 3.1.1。 ### 3.2 伪代码实现 @@ -342,17 +388,15 @@ kernel 内对分支进行如下划分。 2. `steps`为 1,直接计算并返回 $base^{start}$; -3. `start`与`end`同时为 0 或同时为 inf,或者`base`为 1。结果均为 1 或 nan,填充数值; - -4. `start`与`end`仅有 1 个为 inf,结果中填充 0,inf,nan 的组合; +3. `start`与`end`同时为 0 或至少 1 个为 inf,或者`base`为 1。结果为 0,1,inf,nan 的组合,分为4段计算并填充数值; -5. `base`等于 0 ,此时根据`start`与`end`的正负,结果中填充 0 和 inf 的组合; +4. `base`等于 0 ,此时根据`start`与`end`的正负,结果中填充 0,inf,nan 的组合; -6. 间隔 step 等于 0,或在 half 类型下间隔过小。结果中前一半填充 $base^{start}$ ,后一半填充 $base^{end}$; +5. 间隔 step 等于 0,或在 half 类型下间隔过小。结果中前一半填充 $base^{start}$ ,后一半填充 $base^{end}$; -7. `base`小于 0。指数为整数时,pow(x,y) = (-1)^y * 2 ^(y * log2 |x|);指数为小数时,pow(x,y) = nan; +6. `base`小于 0。指数为整数时,pow(x,y) = (-1)^y * 2 ^(y * log2 |x|);指数为小数时,pow(x,y) = nan; -8. 
`base`大于 0,以及输入中存在 nan。直接计算 pow(x,y) = 2 ^(y * log2 (x))。 +7. `base`大于 0,以及输入中存在 nan。直接计算 pow(x,y) = 2 ^(y * log2 (x))。 diff --git a/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h b/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h index dad6fd156..fadcacbff 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h +++ b/kernels/generate_proposals_v2/generate_proposals_v2_nms_utils.h @@ -309,7 +309,7 @@ __mlu_func__ void nonMaximumSuppress( // inter_y1| inter_x2 | inter_y2 | // | N | N | N | - int32_t *loop_end_flag = (int32_t *)(sram_buffer + 28); + int32_t *loop_end_flag = (int32_t *)(sram_buffer + 128); loop_end_flag[0] = 0; // scores, boxes, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 const int memory_block = 13; diff --git a/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu b/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu index 3d9b8620a..bf5887b03 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu +++ b/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu @@ -320,7 +320,8 @@ __mlu_func__ void ProposalForOneImage( variances, min_size, pixel_offset, proposals_score, proposals_box, proposals_temp, topk, hwa, collect_num, &proposals_num); - rpn_rois_num[0] = 0; + + int rpn_num = 0; if (proposals_num == 0) { rpn_rois_num[0] = 1; one_image_proposals_num[0] += rpn_rois_num[0]; @@ -330,11 +331,13 @@ __mlu_func__ void ProposalForOneImage( if (taskDim != 1) { __sync_all_ipu(); } - nonMaximumSuppress(rpn_rois, rpn_roi_probs, rpn_rois_num, proposals_score, + nonMaximumSuppress(rpn_rois, rpn_roi_probs, &rpn_num, proposals_score, proposals_box, proposals_temp, nms_thresh, post_nms_top_n, proposals_num, pixel_offset, hwa); - - one_image_proposals_num[0] += rpn_rois_num[0]; + if (taskId == 0) { + rpn_rois_num[0] = rpn_num; + one_image_proposals_num[0] += rpn_num; + } } template @@ -373,7 +376,9 @@ __mlu_global__ void mluOpGenerateProposalsV2Kernel( pixel_offset, hwa, topk); all_proposals_num += one_image_proposals_num; } - *rpn_rois_batch_size = all_proposals_num; + if (taskId == 0) { + *rpn_rois_batch_size = all_proposals_num; + } } mluOpStatus_t MLUOP_WIN_API KernelGenerateProposalsV2( diff --git a/kernels/logspace/logspace.cpp b/kernels/logspace/logspace.cpp index ea7963e05..d898dd65f 100644 --- a/kernels/logspace/logspace.cpp +++ b/kernels/logspace/logspace.cpp @@ -92,6 +92,8 @@ mluOpLogspace(mluOpHandle_t handle, const float start, const float end, if (MLUOP_GEN_CASE_ON_NEW) { GEN_CASE_START("logspace", "LOGSPACE"); GEN_CASE_HANDLE(handle); + GEN_CASE_DATA(true, "input", nullptr, nullptr, 0, 0); + GEN_CASE_DATA(false, "res", res, res_desc, 0, 0); GEN_CASE_OP_PARAM_SINGLE(0, "logspace", "start", start); GEN_CASE_OP_PARAM_SINGLE(1, "logspace", "end", end); GEN_CASE_OP_PARAM_SINGLE(2, "logspace", "steps", steps); diff --git a/kernels/logspace/logspace_block.mlu b/kernels/logspace/logspace_block.mlu index 5976f6e21..22aa81500 100644 --- a/kernels/logspace/logspace_block.mlu +++ b/kernels/logspace/logspace_block.mlu @@ -84,60 +84,28 @@ __mlu_func__ void dealStep1(const float start, const float base, T *res) { } template -__mlu_func__ void dealAll1Nan(const float start, const float end, - const int64_t steps, T *res) { - float all_the_result = 1; - if ((abs(start) == INFINITY) && (abs(end) == INFINITY)) { - all_the_result = NAN; - } - setResult(res, steps, (T)all_the_result); -} +__mlu_func__ void dealAllResultsOneOrNanOrHalfSpecial(const float start, + const 
float end, + const int64_t steps, + const float base, + T *res) { + float step = (float)(end - start) / (steps - 1); + float base_start = powf(base, start + step * 0); + float base_first_half = powf(base, start + step * 1); + float base_second_half = powf(base, end - step * 1); + float base_end = powf(base, end - step * 0); -template -__mlu_func__ void dealStartEndInfinity(const float start, const float end, - const int64_t steps, const float base, - T *res) { - if (abs(start) == INFINITY) { - if (steps > 2) { - float part_result = powf(base, start); - if (std::is_same::value) { - if (base < 0) { - part_result = INFINITY; - } else { - part_result = NAN; - } - } - setResult(res + steps / 2, (steps - 1) / 2, (T)part_result); - } - setResult(res, steps / 2, (T)NAN); - setResult(res + steps - 1, 1, (T)NAN); - } else if (abs(end) == INFINITY) { - if (steps > 2) { - float part_result = powf(base, end); - if (std::is_same::value) { - if (base < 0) { - part_result = INFINITY; - } else { - part_result = NAN; - } - } - setResult(res + 1, (steps - 1) / 2, (T)part_result); - } - setResult(res, 1, (T)NAN); - setResult(res + (steps + 1) / 2, steps / 2, (T)NAN); - } + int64_t halfway = steps / 2; + setResult(res, 1, (T)base_start); + setResult(res + 1, halfway - 1, (T)base_first_half); + setResult(res + halfway, (steps + 1) / 2 - 1, (T)base_second_half); + setResult(res + steps - 1, 1, (T)base_end); } template __mlu_func__ void dealBase0(const float start, const float end, const int64_t steps, T *res) { - if (start * end > 0) { - if (start > 0) { - setResult(res, steps, (T)0); - } else { - setResult(res, steps, (T)INFINITY); - } - } else { + if (start * end <= 0) { float step = (float)(end - start) / (steps - 1); int numbers_form_start_to_0 = abs(start / step) + 1; if (start > 0) { @@ -149,6 +117,18 @@ __mlu_func__ void dealBase0(const float start, const float end, setResult(res + numbers_form_start_to_0, steps - numbers_form_start_to_0, (T)0); } + } else { + float step = (float)(end - start) / (steps - 1); + float base_start = powf(0, start + step * 0); + float base_first_half = powf(0, start + step * 1); + float base_second_half = powf(0, end - step * 1); + float base_end = powf(0, end - step * 0); + + int64_t halfway = steps / 2; + setResult(res, 1, (T)base_start); + setResult(res + 1, halfway - 1, (T)base_first_half); + setResult(res + halfway, (steps + 1) / 2 - 1, (T)base_second_half); + setResult(res + steps - 1, 1, (T)base_end); } } @@ -354,11 +334,9 @@ __mlu_global__ void MLUKernelLogspace(const float start, const float end, if (steps == 1) { dealStep1(scalar_start, base, res); } else if ((scalar_start == 0 && scalar_end == 0) || base == 1 || - ((abs(scalar_start) == INFINITY) && - (abs(scalar_end) == INFINITY))) { - dealAll1Nan(scalar_start, scalar_end, steps, res); - } else if (abs(scalar_start) == INFINITY || abs(scalar_end) == INFINITY) { - dealStartEndInfinity(scalar_start, scalar_end, steps, base, res); + (abs(scalar_start) == INFINITY) || (abs(scalar_end) == INFINITY)) { + dealAllResultsOneOrNanOrHalfSpecial(scalar_start, scalar_end, steps, base, + res); } else if (base == 0) { dealBase0(scalar_start, scalar_end, steps, res); } else if ((abs((float)(scalar_end - scalar_start) / (steps - 1) == 0)) || diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu index 2f4906d97..6e77c53a4 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu +++ 
b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_forward.mlu @@ -245,12 +245,13 @@ mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnForward( // set handle dump mlu output GEN_CASE_HANDLE(handle); GEN_CASE_DATA(true, "data_value", data_value, data_value_desc, 10, -10); - GEN_CASE_DATA(true, "data_spatial_shapes", data_spatial_shapes, - data_spatial_shapes_desc, 10, -10); - GEN_CASE_DATA(true, "data_level_start_index", data_level_start_index, - data_level_start_index_desc, 10, -10); - GEN_CASE_DATA(true, "data_sampling_loc", data_sampling_loc, - data_sampling_loc_desc, 10, -10); + GEN_CASE_DATA_REAL_V2(true, "data_spatial_shapes", data_spatial_shapes, + data_spatial_shapes_desc, 10, 0); + GEN_CASE_DATA_REAL_V2(true, "data_level_start_index", + data_level_start_index, data_level_start_index_desc, + 10, 0); + GEN_CASE_DATA_REAL_V2(true, "data_sampling_loc", data_sampling_loc, + data_sampling_loc_desc, 10, -10); GEN_CASE_DATA(true, "data_attn_weight", data_attn_weight, data_attn_weight_desc, 10, -10); GEN_CASE_DATA(false, "data_col", data_col, data_col_desc, 0, 0); diff --git a/kernels/rotated_feature_align/rotated_feature_align_block.mlu b/kernels/rotated_feature_align/rotated_feature_align_block.mlu index 58de5d8a5..9ec60b490 100644 --- a/kernels/rotated_feature_align/rotated_feature_align_block.mlu +++ b/kernels/rotated_feature_align/rotated_feature_align_block.mlu @@ -233,6 +233,7 @@ __mlu_global__ void MLUKernelRotatedFeatureAlignForward( const int bboxes_offset = n * width * height * offset_rois + ph * width * offset_rois + pw * offset_rois; const T *cur_bboxes_next = bboxes + bboxes_offset; + __sync(); __memcpy_async((T *)data_nram, cur_bboxes_next, offset_rois * sizeof(T), GDRAM2NRAM); } @@ -423,6 +424,7 @@ __mlu_global__ void MLUKernelRotatedFeatureAlignForward( ph * width * offset_rois + pw * offset_rois; const T *cur_bboxes_next = bboxes + bboxes_offset; // load next bboxes + __sync(); __memcpy_async((T *)data_nram, cur_bboxes_next, offset_rois * sizeof(T), GDRAM2NRAM); } diff --git a/kernels/voxelization/voxelization_kernel.mlu b/kernels/voxelization/voxelization_kernel.mlu index 0d76c106e..04f5580e7 100644 --- a/kernels/voxelization/voxelization_kernel.mlu +++ b/kernels/voxelization/voxelization_kernel.mlu @@ -197,13 +197,14 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, contain_nan_inf ? INT_MIN : round((coors_z_max - coors_z_min) / voxel_z); const int32_t split_num = 9; - const int32_t deal_num = - FLOOR_ALIGN(MAX_NRAM_SIZE / split_num / sizeof(float), NFU_ALIGN_SIZE); + // voxel_size + coors_range need 36 bytes on nram. + const int32_t deal_num = FLOOR_ALIGN( + (MAX_NRAM_SIZE - 36) / split_num / sizeof(float), NFU_ALIGN_SIZE); const int32_t repeat = points_per_core / deal_num; const int32_t rem = points_per_core % deal_num; const int32_t ping_pong_gap = 3 * deal_num * sizeof(float); - int8_t *points_x = nram_buffer; + int8_t *points_x = nram_buffer + 36; int8_t *points_y = points_x + deal_num * sizeof(float); int8_t *points_z = points_y + deal_num * sizeof(float); int8_t *auxiliary_a = points_x + 2 * ping_pong_gap; @@ -292,6 +293,7 @@ __mlu_global__ void mluDynamicVoxelize(const float *points, deal_num * sizeof(int32_t), NRAM2GDRAM); } if (rem > 0) { + __sync(); __memcpy_async(points_x + (repeat % 2) * ping_pong_gap, points + (points_start + repeat * deal_num) * num_features, sizeof(float), GDRAM2NRAM, sizeof(float), @@ -509,6 +511,7 @@ __mlu_global__ void mluCalcPointsPerVoxel( int32_t deal_num_align8 = PAD_UP(deal_num, 8); // step1: load p2p/p2v data. 
+ __sync_compute(); __memcpy_async(nram_p2p_idx, point_to_pointidx + points_offset, deal_num * sizeof(int32_t), GDRAM2NRAM); __memcpy(nram_p2v_idx, point_to_voxelidx + points_offset,
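
Reviewer notes:

1. On the `core/gen_case.h` change: the new `get_float_string_of_half_or_bf16` widens half/bfloat16 to float via `cnrtCastDataType_V2` and formats with `%.9g`; nine significant digits are enough to round-trip any float32 exactly, and this also fixes the old `MLUOP_DTYPE_BFLOAT16` path, which printed the raw `uint16_t` bit pattern through `std::to_string`. A minimal host-side sketch of the same formatting; `bf16_to_float` here is a stand-in for the CNRT cast (lossless, since bfloat16 is the upper 16 bits of a float32):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-in for the cnrtCastDataType_V2 bf16 -> float widening used by the
// patch: bfloat16 is the top 16 bits of a float32, so widening is lossless.
static float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  uint16_t pi_bf16 = 0x4049;  // bfloat16 encoding of ~3.14 (3.140625 exactly)
  char buffer[128];
  // Same format as gen_case: %.9g keeps 9 significant digits, enough to
  // round-trip any float32 value.
  std::snprintf(buffer, sizeof(buffer), "%.9g", bf16_to_float(pi_bf16));
  std::printf("%s\n", buffer);  // prints 3.140625
  return 0;
}
```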
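2. On the logspace changes (`logspace_block.mlu` and section 3.1.3 of the design doc): `dealAllResultsOneOrNanOrHalfSpecial` fills the output in four segments — first element, first half, second half, last element — mirroring PyTorch's two-half evaluation while isolating the `inf * 0 = nan` endpoints. A host-side C++ sketch of that segmentation (`specialLogspace` is an illustrative name, and it assumes libm `powf` matches the kernel's `powf` on these special inputs):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Four-segment fill: res[0], res[1..halfway-1], res[halfway..steps-2],
// res[steps-1], as in dealAllResultsOneOrNanOrHalfSpecial.
std::vector<float> specialLogspace(float start, float end, int64_t steps,
                                   float base) {
  float step = (end - start) / (steps - 1);
  // `step * 0` is intentional: when step is inf, inf * 0 = nan, so the first
  // and last positions differ from the two halves and are computed separately.
  float base_start = powf(base, start + step * 0);
  float base_first_half = powf(base, start + step * 1);
  float base_second_half = powf(base, end - step * 1);
  float base_end = powf(base, end - step * 0);

  int64_t halfway = steps / 2;
  std::vector<float> res(steps);
  res[0] = base_start;
  for (int64_t i = 1; i < halfway; ++i) res[i] = base_first_half;
  for (int64_t i = halfway; i < steps - 1; ++i) res[i] = base_second_half;
  res[steps - 1] = base_end;
  return res;
}

int main() {
  // start = 0, end = inf, base = 2, steps = 8:
  // expected pattern: nan inf inf inf nan nan nan nan
  for (float v : specialLogspace(0.0f, INFINITY, 8, 2.0f)) {
    std::printf("%g ", v);
  }
  std::printf("\n");
  return 0;
}
```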
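3. On the `MLUOP_GTEST_CLUSTER_LIMIT_CAPABILITY` table row above: the value is a bitmask read from the lowest bit upward, bit `i` enabling cluster `i + 1`, so `(1 << n) - 1` enables the first `n` clusters and `2` (binary `0000,0010`) enables only cluster 2. A small worked example (`firstNClustersMask` is a hypothetical helper, not part of the codebase):

```cpp
#include <cstdio>

// Bit i of the mask enables cluster i + 1, so (1 << n) - 1 enables
// clusters 1..n, matching the 1/3/7/15 values in the table.
static unsigned firstNClustersMask(unsigned n) { return (1u << n) - 1u; }

int main() {
  std::printf("first 4 clusters -> mask %u\n", firstNClustersMask(4));  // 15
  std::printf("only cluster 2   -> mask %u\n", 1u << 1);                // 2
  return 0;
}
```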