Skip to content

Commit

Permalink
[Fix](bangc-ops): revise docs according to review.
Browse files Browse the repository at this point in the history
  • Loading branch information
DanieeelLiu committed Oct 16, 2023
1 parent a983d68 commit 2a7e111
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 25 deletions.
21 changes: 12 additions & 9 deletions bangc-ops/kernels/box_iou_rotated/box_iou_rotated_aligned.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#include "kernels/utils/common.h"

// macros defined in kernel.h
// #define COMPUTE_COUNT_ALIGN 64 // elem_count must be divisible by 64
// Multiplier used when computing the offsets of box2_trans, box1_onchip and
// box2_onchip inside nram_buffer (each offset is scaled by
// max_box_pair * sizeof(T)).
// NOTE(review): the previous comment here read "elem_count must be divisible
// by 64", which matches COMPUTE_COUNT_ALIGN (64) rather than this value (48).
// Confirm the intended alignment constraint for this constant.
#define COMPUTE_COUNT_ALIGN_BOX_TRANS 48
// #define CEIL_ALIGN(x, align) (((x) + (align) - 1) / (align) * (align))
// #define FLOOR_ALIGN(x, align) ((x) / (align) * (align))

Expand Down Expand Up @@ -67,11 +67,11 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2,

void *box1_trans = nram_buffer + 4 * max_box_pair * sizeof(T);
void *box2_trans =
nram_buffer + (1 * COMPUTE_COUNT_ALIGN + 4) * max_box_pair * sizeof(T);
nram_buffer + (1 * COMPUTE_COUNT_ALIGN_BOX_TRANS + 4) * max_box_pair * sizeof(T);
void *box1_onchip =
nram_buffer + (2 * COMPUTE_COUNT_ALIGN + 4) * max_box_pair * sizeof(T);
nram_buffer + (2 * COMPUTE_COUNT_ALIGN_BOX_TRANS + 4) * max_box_pair * sizeof(T);
void *box2_onchip =
nram_buffer + (3 * COMPUTE_COUNT_ALIGN + 4) * max_box_pair * sizeof(T);
nram_buffer + (2 * COMPUTE_COUNT_ALIGN_BOX_TRANS + COMPUTE_COUNT_ALIGN + 4) * max_box_pair * sizeof(T);

// After transpose, box1/2_onchip data can be over-written
void *temp1_ram = (char *)box2_onchip;
Expand Down Expand Up @@ -154,12 +154,15 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2,

// Initialize valid_box, set actual_box_num boxes to 1, else set to 0
__bang_write_value((T *)valid_box, actual_compute_box_num, (T)1);

__bang_write_zero(valid_box + actual_box_num,
actual_compute_box_num - actual_box_num);
// Optimize here
if (actual_box_num < actual_compute_box_num) {
for (uint32_t i = actual_box_num; i < actual_compute_box_num; i++) {
((T *)valid_box)[i] = 0;
}
}
// if (actual_box_num < actual_compute_box_num) {
// for (uint32_t i = actual_box_num; i < actual_compute_box_num; i++) {
// ((T *)valid_box)[i] = 0;
// }
// }

// Each box data: x, y, w, h, a
// area1 = box1.h * box1.w;
Expand Down
34 changes: 18 additions & 16 deletions bangc-ops/kernels/box_iou_rotated/box_iou_rotated_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -481,19 +481,21 @@ __mlu_func__ void convexHullGraham(
__bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box,
actual_compute_box_num);

T* temp_test_long3 = ordered_pts_x;

// set p[i].x to max_x_value if not min_y point
__bang_argmax((T *)temp1_ram, (T *)intersect_pts_x,
24 * actual_compute_box_num);
__bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram,
24 * actual_compute_box_num, actual_compute_box_num);
__bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts,
24 * actual_compute_box_num);
__bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num);
__bang_mul_scalar((T *)temp_long_3, (T *)temp_long_3,
__bang_not((T *)temp_test_long3, (T *)temp_long_1, 24 * actual_compute_box_num);
__bang_mul_scalar((T *)temp_test_long3, (T *)temp_test_long3,
(T)((T *)temp1_ram)[0], 24 * actual_compute_box_num);
__bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1,
24 * actual_compute_box_num);
__bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3,
__bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_test_long3,
24 * actual_compute_box_num);
// temp3 = min_x_value(temp_long_1), use min_pool, channel=box_num, h=1,
// w=24
Expand Down Expand Up @@ -550,7 +552,7 @@ __mlu_func__ void convexHullGraham(
// If the angles are the same, sort according to distance to origin
dot2d<T>((T *)dist_ram, (T *)intersect_pts_x, (T *)intersect_pts_y,
(T *)intersect_pts_x, (T *)intersect_pts_y,
24 * actual_compute_box_num, (T *)temp_long_3);
24 * actual_compute_box_num, (T *)temp_test_long3);
// for (k = 1; k < num_in; k++) {
// if (dist[k] > 1e-8) {
// break;
Expand Down Expand Up @@ -611,7 +613,7 @@ __mlu_func__ void convexHullGraham(
// __bang_printf("%f ", temp_long_2[k*actual_compute_box_num + offset]);
// }
// __bang_printf("\nangle value\n");
__bang_maxpool_value_index((T *)temp_long_3, (T *)temp_long_2,
__bang_maxpool_value_index((T *)dist_ram, (T *)temp_long_2,
actual_compute_box_num, 1, 24, 1, 24, 1, 24,
actual_compute_box_num * sizeof(T));
// for (int k = 0; k < actual_compute_box_num; k++)
Expand All @@ -623,42 +625,42 @@ __mlu_func__ void convexHullGraham(
// __bang_printf("%d ", ((uint *)(temp_long_3 + actual_compute_box_num))[k]);
// }
// __bang_printf("\actual_box_num: %d actual_compute_box_num: %d\n", actual_box_num, actual_compute_box_num);
__bang_write_value((uint*)(temp_long_3 + real_compute_box_num + actual_compute_box_num), actual_compute_box_num, (uint)23);
__bang_write_value((uint*)(dist_ram + real_compute_box_num + actual_compute_box_num), actual_compute_box_num, (uint)23);

// for (int k = 0; k < actual_compute_box_num; k++) {
// __bang_printf("%d ", ((uint *)(temp_long_3 + actual_compute_box_num))[k]);
// }
// __bang_printf("\nmaxpool_index\n");
__bang_mul_scalar((unsigned int *)(temp_long_3 + 3 * actual_compute_box_num),(unsigned int *)(temp_long_3 + actual_compute_box_num), actual_compute_box_num, actual_compute_box_num);
__bang_mul_scalar((unsigned int *)(dist_ram + 3 * actual_compute_box_num),(unsigned int *)(dist_ram + actual_compute_box_num), actual_compute_box_num, actual_compute_box_num);
// for (int k = 0; k < actual_compute_box_num; k++) {
// __bang_printf("%d ", ((uint *)(temp_long_3 + 3 * actual_compute_box_num))[k]);
// }
// __bang_printf("\nmaxpool_index fixed 1\n");
__bang_add((unsigned int *)(temp_long_3 + 3 * actual_compute_box_num),
(unsigned int *)(temp_long_3 + 3 * actual_compute_box_num),
__bang_add((unsigned int *)(dist_ram + 3 * actual_compute_box_num),
(unsigned int *)(dist_ram + 3 * actual_compute_box_num),
(unsigned int *)temp3_ram, actual_compute_box_num);
__bang_mul_scalar((unsigned int *)(temp_long_3 + 3 * actual_compute_box_num),(unsigned int *)(temp_long_3 + 3 * actual_compute_box_num), sizeof(T), actual_compute_box_num);
__bang_mul_scalar((unsigned int *)(dist_ram + 3 * actual_compute_box_num),(unsigned int *)(dist_ram + 3 * actual_compute_box_num), sizeof(T), actual_compute_box_num);

__bang_sub_scalar((unsigned int *)(temp_long_3 + 72),(unsigned int *)(temp_long_3 + 72), 4, 24);
__bang_sub_scalar((unsigned int *)(dist_ram + 72),(unsigned int *)(dist_ram + 72), 4, 24);
// for (int k = 0; k < actual_compute_box_num; k++) {
// __bang_printf("%d ", ((uint *)(temp_long_3 + 3 * actual_compute_box_num))[k]);
// }
// __bang_printf("\nmaxpool_index fixed 2\n");
__gather(ordered_pts_x + (i+1) * actual_compute_box_num, intersect_pts_x , (unsigned int *)(temp_long_3 + 3 * actual_compute_box_num), sizeof(T),
__gather(ordered_pts_x + (i+1) * actual_compute_box_num, intersect_pts_x , (unsigned int *)(dist_ram + 3 * actual_compute_box_num), sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
__gather(ordered_pts_y + (i+1) * actual_compute_box_num, intersect_pts_y, (unsigned int *)(temp_long_3 + 3 * actual_compute_box_num), sizeof(T),
__gather(ordered_pts_y + (i+1) * actual_compute_box_num, intersect_pts_y, (unsigned int *)(dist_ram + 3 * actual_compute_box_num), sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
__gather(temp_long_1 + (i+1) * actual_compute_box_num, valid_pts, (unsigned int *)(temp_long_3 + 3 * actual_compute_box_num), sizeof(T),
__gather(temp_long_1 + (i+1) * actual_compute_box_num, valid_pts, (unsigned int *)(dist_ram + 3 * actual_compute_box_num), sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
// __bang_printf("\n valid_pts: %f \n", temp_long_1[(i+1) * actual_compute_box_num]);

// for (int k = 0; k < 24; k++) {
// __bang_printf("%f ", temp1_ram[k]);
// }
// __bang_printf("\ntemp_long1\n");
__scatter(temp_long_2, temp1_ram, (unsigned int *)(temp_long_3 + 3 * actual_compute_box_num),
__scatter(temp_long_2, temp1_ram, (unsigned int *)(dist_ram + 3 * actual_compute_box_num),
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
__scatter(valid_pts, temp2_ram, (unsigned int *)(temp_long_3 + 3 * actual_compute_box_num),
__scatter(valid_pts, temp2_ram, (unsigned int *)(dist_ram + 3 * actual_compute_box_num),
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
// for (int k = 0; k < 24; k++)
// {
Expand Down

0 comments on commit 2a7e111

Please sign in to comment.