Skip to content

Commit

Permalink
Merge branch 'master' into add_lite_interface
Browse files Browse the repository at this point in the history
  • Loading branch information
PetrelYy authored Dec 16, 2024
2 parents 3e2ca52 + acbe8c2 commit e454ee6
Show file tree
Hide file tree
Showing 41 changed files with 855 additions and 999 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ jobs:
strategy:
matrix:
runner: [mlu370-m8]
mlu_ops_version : [1.4.0]
mlu_ops_version : [1.4.1]
cntoolkit_version : [3.15.2]
cnnl_version: [1.27.4]
cnnl_version: [1.28.0]
runs-on: ${{matrix.runner}}
steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/mluops_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
strategy:
matrix:
runner: [mlu370-m8]
mlu_ops_version : [v1.4.0]
mlu_ops_version : [v1.4.1]
runs-on: [yellow]
steps:
- uses: actions/checkout@v3
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ MLU-OPS™提供了以下功能:
## 依赖条件

- 操作系统:
- 支持 x86_64 架构下的 Ubuntu20.04、Centos7.6、Centos8.5、Kylin10
- MLU-OPS™ v1.0.0版本后将不再支持 Ubuntu18.04。Ubuntu22.04系统将在后续的版本提供支持。
- 支持 x86_64 架构下的 Ubuntu22.04、Centos7.6、Centos8.5、Kylin10
- MLU-OPS™ v1.0.0版本后将不再支持 Ubuntu18.04。
- MLU-OPS™ v1.4.1版本后将不再支持 Ubuntu20.04。
- 寒武纪 MLU SDK:
- 编译和运行时依赖 CNToolkit v3.15.2 或更高版本,CNNL v1.27.4 或者更高版本
- 寒武纪 MLU 驱动:
Expand Down
4 changes: 2 additions & 2 deletions build.property
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"version": "1.4.0-1",
"version": "1.4.1-1",
"python": "3.6.0",
"build_requires": {"cntoolkit": ["release","3.15.2-1"],
"cnnl":["release","1.27.4-1"],
"cnnl":["release","1.28.0-1"],
"driver": "6.0.3",
"eigen3": "3.4.0",
"libxml2": "2.9.0",
Expand Down
47 changes: 24 additions & 23 deletions core/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#include "core/tool.h"
#include "kernels/kernel.h"

// Expands to the output stream of a temporary mluop::logging::LogMessage
// tagged with the current file/line, the caller-supplied `level` and the
// "MLU-OPS" module name, so it can be used as `DEP_CHECK_LOG(level) << ...`.
// NOTE(review): the literal 4 and the four `true` flags are passed straight
// to the LogMessage constructor; their meaning is defined there - confirm
// against core/logging before changing them.
#define DEP_CHECK_LOG(level)                                                 \
  mluop::logging::LogMessage(__FILE__, __LINE__, 4, level, "MLU-OPS", true, \
                             true, true, true)                               \
      .stream()

namespace mluop {
Expand All @@ -46,27 +46,23 @@ static struct deviceName name_list_table[] = {
// case.
};

// Once cnrtGetDeviceProperties() is updated and no longer uses
// device_ordinal, update this function.
mluOpDevType_t convertDeviceName(char *name) {
struct deviceName *pName = NULL;
int num = sizeof(name_list_table) / sizeof(struct deviceName);
if (CONTEXT_DEVICENAME_LEAST_SIZE > strlen(name)) {
LOG(ERROR)
<< "get device name failed. device name too short. device name = "
<< name << "\n";
return MLUOP_UNKNOWN_DEVICE;
}
for (int i = 0; i < num; i++) {
pName = &name_list_table[i];
if (0 == strncmp(pName->name, name, strlen(pName->name)) ||
(i == num - 1 &&
0 >= strncmp(pName->name, name, CONTEXT_DEVICENAME_LEAST_SIZE))) {
return pName->type;
// Translates an integer device code into the matching mluOpDevType_t.
//
// Recognised codes:
//   372 -> MLUOP_MLU370
//   592 -> MLUOP_MLU590
//   613 -> MLUOP_MTP613
//
// Any other value is reported through LOG(ERROR) and mapped to
// MLUOP_UNKNOWN_DEVICE, so callers must be prepared for that result.
mluOpDevType_t convertDeviceNameFromInt(int device_code) {
  switch (device_code) {
    case 372:
      return MLUOP_MLU370;
    case 592:
      return MLUOP_MLU590;
    case 613:
      return MLUOP_MTP613;
    default:
      break;
  }
  // Only the integer code is available here, so that is what gets logged.
  LOG(ERROR) << "get device name failed. return unknown device. device code = "
             << device_code << "\n";
  return MLUOP_UNKNOWN_DEVICE;
}
} // namespace mluop
Expand Down Expand Up @@ -179,6 +175,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) {
int32_t persisting_l2cache_maxsize = 0;
double memory_band_width = 0;
char device_name[CONTEXT_DEVICENAME_BUFFER_SIZE] = "";
int device_code = 0;
mluOpContext *ctx = new (std::nothrow) mluOpContext();
CNcontext drv_ctx;
CNctxConfigParam ctx_conf_param;
Expand Down Expand Up @@ -246,6 +243,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) {
cnDeviceGetAttribute(&persisting_l2cache_maxsize,
CN_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE,
mlu_dev));
INTERNAL_CHECK(
"[mluOpCreate]",
CN_SUCCESS == cnDeviceGetAttribute(&device_code,
CN_DEVICE_ATTRIBUTE_MLU_ISA_VERSION,
mlu_dev));
INTERNAL_CHECK(
"[mluOpCreate]",
CN_SUCCESS == cnDeviceGetName(device_name, CONTEXT_DEVICENAME_BUFFER_SIZE,
Expand All @@ -266,8 +268,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) {
}

ctx->capability_job_limit = (int32_t)ctx_conf_param.unionLimit;
ctx->arch = mluop::convertDeviceName(
device_name); // warning: possible return unknown.
ctx->arch = mluop::convertDeviceNameFromInt(device_code);
ctx->sram_size = sram_size - REM_FOR_STACK;

strncpy(ctx->device_name, device_name, sizeof(device_name));
Expand Down
1 change: 1 addition & 0 deletions core/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ typedef enum {
MLUOP_MLU270 = 270,
MLUOP_MLU370 = 372,
MLUOP_MLU590 = 592,
MLUOP_MTP613 = 613,
MLUOP_MLU290 = 290,
} mluOpDevType_t;

Expand Down
10 changes: 9 additions & 1 deletion docs/api_guide/update.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,17 @@ Update History

This section lists contents that were made for each product release.

* V1.4.1

**Date:** December 5, 2024

**Changes:**

- None.

* V1.4.0

**Date:** October 21, 2024
**Date:** November 29, 2024

**Changes:**

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,8 @@ __mlu_func__ void bilinearInterpolatePosWeight(
w3[i] += w3[j];
w4[i] += w4[j];
w1[j] = -1;
} else {
break;
}
}
if (unique_num != i) {
Expand Down Expand Up @@ -386,22 +388,21 @@ bin_hw_order_num = bin_order_num ^ 2。<br>
| pos4 | sizeof(uint) * bin_hw_order_num | pos4 坐标 |


剩余空间对齐均分为三份 vi, vi_t, val,记空间大小为 max_v_size。<br>
其中 vi 复用多次,最终的 val_sum 也存储于 vi 中。<br>
此时 max_once_c = max_v_size / unique_num / sizeof(T)。 <br>
剩余空间对齐均分为两份 val, v_t,记空间大小为 max_v_size。<br>
此时 max_once_c = max_v_size / 4 / unique_num / sizeof(T)。 <br>
以float 类型为例:
- 若 bin_order_num 为 32,固定的 size 为 53376, max_v_size 为 113280
unique_num 最大可到 bin_hw_order_num(1024),此时 max_once_c = 27
- 若 bin_order_num 为 8,固定的 size 为 7296, max_v_size 为 128640
unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 502
- 若 bin_order_num 为 32,固定的 size 为 53376, max_v_size 为 169920
unique_num 最大可到 bin_hw_order_num(1024),此时 max_once_c = 10
- 若 bin_order_num 为 8,固定的 size 为 7296, max_v_size 为 192960
unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 188


### 3.4 性能优化设计
1.向量化加速。
2.减少重复计算,例如:roi_info 计算,bin_h、bin_w 二维序列构建等。
3.使用 fuse.nram 融合三条以上的乘加法。
4.双线性插值坐标进行查重,减少 IO 的数量。

5.将周围四个点坐标搬运成连续向量,gather时一次性处理,在有效点较少时能提升 IO 效率。

### 3.5 可维护性设计

Expand Down
18 changes: 18 additions & 0 deletions docs/release_notes/mlu_ops.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,24 @@ Cambricon MLU-OPS具有以下特点:
+-----------------------------+------------------------+--------------------------------+


v1.4.1
-----------------

特性变更
~~~~~~~~~~~~~~~~~~~~~

- 无。

已修复问题
~~~~~~~~~~~~~~~~~~~~~

- 无。

已知遗留问题
~~~~~~~~~~~~~~~~~~~~~

- 无。

v1.4.0
-----------------

Expand Down
7 changes: 7 additions & 0 deletions docs/user_guide/2_update_history/index.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
更新历史
========

* **V1.4.1**
**更新时间**:2024年12月5日

**更新内容**:

- 无算子更新。

* **V1.4.0**
**更新时间**:2024年11月29日

Expand Down
4 changes: 4 additions & 0 deletions independent_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ long_args=(
help
mlu370 # mlu arch
mlu590
mtp613
no_prepare
perf
prepare
Expand All @@ -69,6 +70,9 @@ add_mlu_arch_support () {
--mlu590)
bang_arch="mtp_592;"
;;
--mtp613)
bang_arch="mtp_613;"
;;
*)
;;
esac
Expand Down
4 changes: 3 additions & 1 deletion installer/centos7.5/SPECS/mluops-independent.spec
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

Name: mluops
Summary: The Machine Learning Unit OPerators
Version: 1.4.0
Version: 1.4.1
Release: 1%{?dist}
License: Cambricon Release License
Vendor: Cambricon Inc.
Expand Down Expand Up @@ -64,6 +64,8 @@ cp $RPM_SOURCE_DIR/neuware-env.conf $RPM_BUILD_ROOT/etc/ld.so.conf.d/
%postun -p /sbin/ldconfig

%changelog
* Thu Dec 5 2024 Cambricon Software Team <[email protected]>
- release mluops v1.4.1
* Fri Nov 29 2024 Cambricon Software Team <[email protected]>
- release mluops v1.4.0
* Mon Oct 21 2024 Cambricon Software Team <[email protected]>
Expand Down
8 changes: 7 additions & 1 deletion installer/independent/debian/changelog
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
mluops (1.4.1-1.ubuntu16.04) xenial; urgency=medium

* Release mluops v1.4.1

-- Cambricon <[email protected]> Thu, 5 Dec 2024 00:00:00 +0100

mluops (1.4.0-1.ubuntu16.04) xenial; urgency=medium

* Release mluops v1.4.0

-- Cambricon <[email protected]> Thu, 29 Nov 2024 00:00:00 +0100
-- Cambricon <[email protected]> Fri, 29 Nov 2024 00:00:00 +0100

mluops (1.3.2-1.ubuntu16.04) xenial; urgency=medium

Expand Down
28 changes: 15 additions & 13 deletions kernels/box_iou_rotated/box_iou_rotated_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#define KERNELS_BOX_IOU_ROTATED_BOX_IOU_ROTATED_UTILS_H_

#include "kernels/utils/common.h"
#include "kernels/utils/scatter_gather.h"

#define FIILED_ONES (int)0xffffffff
#define HALF_FILLED_ONES (int16_t)0xffff
Expand Down Expand Up @@ -590,21 +591,22 @@ __mlu_func__ void convexHullGraham(
sizeof(T), actual_compute_box_num);

// get the ordered points according to the angle value
__gather(ordered_pts_x + (i + 1) * actual_compute_box_num, intersect_pts_x,
(unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__gather(ordered_pts_y + (i + 1) * actual_compute_box_num, intersect_pts_y,
(unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts,
(unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__mluop_gather<T>(ordered_pts_x + (i + 1) * actual_compute_box_num,
intersect_pts_x, (unsigned int *)temp_offset, NULL,
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
__mluop_gather<T>(ordered_pts_y + (i + 1) * actual_compute_box_num,
intersect_pts_y, (unsigned int *)temp_offset, NULL,
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
__mluop_gather<T>(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts,
(unsigned int *)temp_offset, NULL, sizeof(T), NRAM2NRAM,
sizeof(T), actual_compute_box_num);

// assign an invalid value to the points that have already been ordered
__scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
__scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
__mluop_scatter<T>(temp_long_2, temp1_ram, (unsigned int *)temp_offset,
NULL, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__mluop_scatter<T>(valid_pts, temp2_ram, (unsigned int *)temp_offset, NULL,
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
}
__bang_move(valid_pts, temp_long_1, total_points * sizeof(T));
#else
Expand Down
Loading

0 comments on commit e454ee6

Please sign in to comment.