diff --git a/core/runtime/device.h b/core/runtime/device.h index fe8aed516..5bf367b1e 100644 --- a/core/runtime/device.h +++ b/core/runtime/device.h @@ -119,7 +119,7 @@ inline int32_t getClusterNumberOfJobLimitCapability(mluOpHandle_t handle) { inline cnrtFunctionType_t castCnKernelClassToCnrtFuncType(KernelClass jobType) { switch (jobType) { default: - return CNRT_FUNC_TYPE_MUTABLE; + return cnrtFuncTypeMutable; case CN_KERNEL_CLASS_BLOCK: return cnrtFuncTypeBlock; case CN_KERNEL_CLASS_UNION: diff --git a/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md b/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md index aac151304..1a5fec426 100644 --- a/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md +++ b/docs/design_docs/moe_dispatch_backward_gate/moe_dispatch_backward_gate.md @@ -476,7 +476,7 @@ for (int i = 0; i < samples; ++i) { int sample_idx = 0; // 从 workspace load所有中间计算结果 T *nram_grad_gates = (T *)nram_buffer; - __bang_write_zero(nram_grad_gates, samples); + __bang_write_value(nram_grad_gates, samples, (T)0); for (int ti = 0; ti < taskDim; ti++) { if ((rem_task > 0) && (ti < (one_sample_task_num + 1) * rem_task)) { sample_idx = (int)(ti / (one_sample_task_num + 1)); @@ -567,7 +567,7 @@ for (int i = 0; i < samples; ++i) { // 复用nram_location空间 T *nram_grad_gates = (T*)nram_location; - __bang_write_zero(nram_grad_gates, deal_s_num); + __bang_write_value(nram_grad_gates, deal_s_num, (T)0); // 三级流水计算过程 // step4 diff --git a/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md b/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md index 5dc407284..2002b873e 100644 --- a/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md +++ b/docs/design_docs/ms_deform_attn_backward/ms_deform_attn_backward.md @@ -264,8 +264,8 @@ void msDeformAttnCol2imBilinear(){ __memcpy(top_grad, grad_output, deal_num * sizeof(T), GDRAM2NRAM); __bang_mul_scalar(top_grad_temp, top_grad,
attn_weight, deal_num); - __bang_write_zero(grad_h_weight, deal_num); - __bang_write_zero(grad_w_weight, deal_num); + __bang_write_value(grad_h_weight, deal_num, 0); + __bang_write_value(grad_w_weight, deal_num, 0); if (h_low >= 0 && w_low >= 0) { const int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; __memcpy(grad_output_nram, grad_output + offset1, deal_num * sizeof(T), GDRAM2NRAM); diff --git a/docs/design_docs/points_in_boxes/points_in_boxes.md b/docs/design_docs/points_in_boxes/points_in_boxes.md index 1f6b83a83..20fe47b0c 100644 --- a/docs/design_docs/points_in_boxes/points_in_boxes.md +++ b/docs/design_docs/points_in_boxes/points_in_boxes.md @@ -303,7 +303,7 @@ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num, const fl X = points[0]; Y = points[m]; Z = points[2*m]; - bang_write_zero(last, 0); + bang_write_value(last, 0, 0); loop boxes for t in range(boxes_num): boxes = b * T * 7 + t * 7; (cx, cy, cz, dx, dy, dz, rz) = boxes[0:7]; diff --git a/docs/design_docs/prior_box/prior_box_design_doc.md b/docs/design_docs/prior_box/prior_box_design_doc.md index 9068cfd81..9ed64ec0a 100644 --- a/docs/design_docs/prior_box/prior_box_design_doc.md +++ b/docs/design_docs/prior_box/prior_box_design_doc.md @@ -394,7 +394,7 @@ mluOpPriorBox(mluOpHandle_t handle, 对`one_loop_pixel_num`循环处理,一次循环只初始化一个点的`num_priors`个框的坐标,设该点在`feature_map`上的索引为`pixel_index` - - 调用__bang_write_zero()将`boxes`置为0,即 boxes = [0,0,0,0,0,0,0,0] + - 调用__bang_write_value()将`boxes`置为0,即 boxes = [0,0,0,0,0,0,0,0] - 计算当前处理的点的位置,x_index = pixel_index % width,y_index = pixel_index / width,图中x_index = 0,1,y_index = 0,1,2。 - 将`x_index`(上图中x的坐标)和`x_mask`相乘,得到`tmp_x`(tmp_x为x_index,x_mask相乘的结果),tmp_x = [x_index,0,x_index,0,x_index,0,x_index,0] diff --git a/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md b/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md index 60d1f2d98..965d3eb09 100644 --- a/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md +++ 
b/docs/design_docs/roipoint_pool3d/roipoint_pool3d.md @@ -281,7 +281,7 @@ void roipoint_pool3d_union1(const int batch_size, __memcpy_async(ping_input2, points_y_start + (bs_idx * pts_num) * sizeof(T), span_num_deal_size, GDRAM2NRAM); __memcpy_async(ping_input3, points_z_start + (bs_idx * pts_num) * sizeof(T), span_num_deal_size, GDRAM2NRAM); __memcpy_async(point_features, point_features_start, span_num_deal_size, GDRAM2NRAM); - __bang_write_zero((T *)cnt, boxes_num); + __bang_write_value((T *)cnt, boxes_num, (T)0); size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0; size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num; diff --git a/kernels/ball_query/ball_query_union1.mlu b/kernels/ball_query/ball_query_union1.mlu index 33c4b2bca..e7c1daa85 100644 --- a/kernels/ball_query/ball_query_union1.mlu +++ b/kernels/ball_query/ball_query_union1.mlu @@ -121,7 +121,7 @@ __mlu_func__ void ballQueryWorkflow( T *new_xyz_nram = vec_new_x1; __memcpy(new_xyz_nram, &new_xyz[base1], num_deal_new_xyz * 3 * sizeof(T), GDRAM2NRAM); - __bang_write_zero(vec_idx_num, num_stride); + __bang_write_value(vec_idx_num, num_stride, (int32_t)0); for (uint32_t new_index = index_new_xyz; new_index < (index_new_xyz + num_deal_new_xyz);) { diff --git a/kernels/box_iou_rotated/box_iou_rotated_aligned.h b/kernels/box_iou_rotated/box_iou_rotated_aligned.h index 67a948ffe..7bb94f192 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_aligned.h +++ b/kernels/box_iou_rotated/box_iou_rotated_aligned.h @@ -62,7 +62,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, const uint32_t max_box_pair = FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN); // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair); + __bang_write_value((uint8_t *)nram_buffer, copies_of_nram * max_box_pair, + (uint8_t)0); void *box1_trans = nram_buffer + 4 *
max_box_pair * sizeof(T); void *box2_trans = @@ -224,8 +225,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2, (T *)temp4_ram, (T *)temp5_ram, actual_compute_box_num); // initialize valid_pts, nums_in - __bang_write_zero((T *)valid_pts, 24 * actual_compute_box_num); - __bang_write_zero((T *)nums_in_ram, actual_compute_box_num); + __bang_write_value((T *)valid_pts, 24 * actual_compute_box_num, (T)0); + __bang_write_value((T *)nums_in_ram, actual_compute_box_num, (T)0); // 3. Get all intersection points getIntersectionPoints( diff --git a/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h b/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h index ec44b3abd..82db7292b 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h +++ b/kernels/box_iou_rotated/box_iou_rotated_nonaligned.h @@ -68,7 +68,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, const uint32_t max_box_pair = FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN); // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair); + __bang_write_value((uint8_t *)nram_buffer, copies_of_nram * max_box_pair, + (uint8_t)0); void *box1_onchip = nram_buffer + 2 * max_box_pair * sizeof(T); void *box2_onchip = @@ -190,7 +191,7 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, const T area_thres = 1e-14; if (area1 < area_thres) { // set all current box-paires ious to zeros - __bang_write_zero((T *)ious_ram, actual_compute_box_num); + __bang_write_value((T *)ious_ram, actual_compute_box_num, (T)0); __memcpy(ious + current_ious_offset, (T *)ious_ram, actual_box2_num * sizeof(T), NRAM2GDRAM); continue; @@ -309,8 +310,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2, (T *)temp3_ram, (T *)temp4_ram, actual_compute_box_num); - __bang_write_zero((T *)valid_pts, 24 * 
actual_compute_box_num); - __bang_write_zero((T *)nums_in_ram, actual_compute_box_num); + __bang_write_value((T *)valid_pts, 24 * actual_compute_box_num, (T)0); + __bang_write_value((T *)nums_in_ram, actual_compute_box_num, (T)0); // 3. Get all intersection points getIntersectionPoints( diff --git a/kernels/box_iou_rotated/box_iou_rotated_utils.h b/kernels/box_iou_rotated/box_iou_rotated_utils.h index 22aa3e0ec..e9194ef52 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_utils.h +++ b/kernels/box_iou_rotated/box_iou_rotated_utils.h @@ -455,8 +455,8 @@ __mlu_func__ void convexHullGraham( // if all of boxes are invalid, just return. int valid_box_count = __bang_count((T *)valid_box, real_compute_box_num); if (!valid_box_count) { - __bang_write_value((T *)ordered_pts_x, total_points, 0); - __bang_write_value((T *)ordered_pts_y, total_points, 0); + __bang_write_value((T *)ordered_pts_x, total_points, (T)0); + __bang_write_value((T *)ordered_pts_y, total_points, (T)0); __bang_write_value((T *)valid_pts, actual_compute_box_num, (T)1); __bang_write_value((T *)valid_pts + actual_compute_box_num, total_points - actual_compute_box_num, (T)0); @@ -559,8 +559,8 @@ __mlu_func__ void convexHullGraham( // assign invalid value to temp1_ram(-2 < -1) and temp2_ram for sorting. 
__bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)-2); __bang_write_value((T *)temp2_ram, actual_compute_box_num, (T)0); - __bang_write_value((T *)ordered_pts_x, total_points, 0); - __bang_write_value((T *)ordered_pts_y, total_points, 0); + __bang_write_value((T *)ordered_pts_x, total_points, (T)0); + __bang_write_value((T *)ordered_pts_y, total_points, (T)0); // get the offset of each max value according to the channel __mluop_get_stage_indices_tfuse((int *)temp3_ram, actual_compute_box_num); @@ -783,7 +783,7 @@ __mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box, actual_compute_box_num); // temp1 = area, initialize with all 0 - __bang_write_zero((T *)temp1_ram, actual_compute_box_num); + __bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)0); __bang_argmax((T *)temp6_ram, (T *)nums_in_ram, actual_compute_box_num); // temp_nums_in = max(nums_in) diff --git a/kernels/carafe/carafe_block.mlu b/kernels/carafe/carafe_block.mlu index c84572001..74eaed328 100644 --- a/kernels/carafe/carafe_block.mlu +++ b/kernels/carafe/carafe_block.mlu @@ -333,17 +333,17 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output, h_k * wo * group * k_up * k_up + w_k * group * k_up * k_up + group_k * k_up * k_up; - __bang_write_zero((T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), - NRAM_BLOCK / sizeof(T)); - __bang_write_zero((T *)nram_buf + 4 * NRAM_BLOCK / sizeof(T), - NRAM_BLOCK / sizeof(T)); - __bang_write_zero((T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T), - NRAM_BLOCK / sizeof(T)); + __bang_write_value((T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), + NRAM_BLOCK / sizeof(T), (T)0); + __bang_write_value((T *)nram_buf + 4 * NRAM_BLOCK / sizeof(T), + NRAM_BLOCK / sizeof(T), (T)0); + __bang_write_value((T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T), + NRAM_BLOCK / sizeof(T), (T)0); __memcpy((T *)nram_buf + NRAM_BLOCK / sizeof(T), (T *)base_mask, k_up * k_up * sizeof(T), GDRAM2NRAM); for (int i = 0; i < num_per_loop; i++) { - 
__bang_write_zero((T *)nram_buf, NRAM_BLOCK / sizeof(T)); + __bang_write_value((T *)nram_buf, NRAM_BLOCK / sizeof(T), (T)0); T *base_grad_output = (T *)grad_output + n_k * ho * wo * c + h_k * wo * c + w_k * c + group_k * group_size + i * num_align; @@ -386,7 +386,7 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output, } } if (rem_for_loop) { - __bang_write_zero((T *)nram_buf, NRAM_BLOCK / sizeof(T)); + __bang_write_value((T *)nram_buf, NRAM_BLOCK / sizeof(T), (T)0); T *base_grad_output = (T *)grad_output + n_k * ho * wo * c + h_k * wo * c + w_k * c + group_k * group_size + num_per_loop * num_align; diff --git a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu index d9bb2bffa..e18e654b5 100644 --- a/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu +++ b/kernels/diff_iou_rotated_sort_vertices_forward/diff_iou_rotated_sort_vertices_forward_block.mlu @@ -132,7 +132,7 @@ static __mlu_func__ void compute(int8_t *nram_vertices, int8_t *nram_mask, deal_num * dim_m); // preprocess to get pad index - __bang_write_zero(nram_temp0, dim_m); + __bang_write_value(nram_temp0, dim_m, (T)0); __bang_write_value(nram_temp0, INTERSECTION_OFFSET, (T)1.0); __bang_int82float(nram_pad, (int8_t *)(nram_mask_p), deal_num * dim_m, 0); __bang_cycle_maxequal(nram_pad, nram_pad, nram_temp0, deal_num * dim_m, diff --git a/kernels/lgamma/lgamma_block.mlu b/kernels/lgamma/lgamma_block.mlu index f3366189c..2383f96d4 100644 --- a/kernels/lgamma/lgamma_block.mlu +++ b/kernels/lgamma/lgamma_block.mlu @@ -161,7 +161,7 @@ __mlu_func__ void calcLgamma(float *buf0, float *buf1, float *buf2, float *buf3, * reflection_denom - lgamma_x : -reflection_denom; */ // using buf3 -> reflection - __bang_write_zero(buf4, num_deal); + __bang_write_value(buf4, num_deal, (float)0); __bang_sub(buf2, buf4, 
buf2, num_deal); isFinite(buf4, buf2, num_deal); __bang_sub(buf3, buf2, buf1, num_deal); diff --git a/kernels/logspace/logspace.cpp b/kernels/logspace/logspace.cpp index d898dd65f..421a7c5b4 100644 --- a/kernels/logspace/logspace.cpp +++ b/kernels/logspace/logspace.cpp @@ -31,7 +31,7 @@ static void LogspacePolicyFunc(const mluOpHandle_t &handle, const int64_t steps, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_BLOCK; + *k_type = cnrtFuncTypeBlock; uint32_t cluster_num = mluop::runtime::getCoreNumOfEachUnionCapability(handle); uint32_t core_in_cluster = handle->core_num_per_cluster; diff --git a/kernels/logspace/logspace_block.mlu b/kernels/logspace/logspace_block.mlu index 625597f24..5e24191cb 100644 --- a/kernels/logspace/logspace_block.mlu +++ b/kernels/logspace/logspace_block.mlu @@ -60,16 +60,16 @@ __mlu_func__ void float2DifferentType(float *result_float, T *result, __bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max, (int16_t *)table_half_all1, num, LUT_TABEL_LENGTH); __bang_float2half_tz((half *)result, result_float, num); - __bang_bor((int16_t *)result, (int16_t *)result, - (int16_t *)result_ge_half_max, num); + __bang_bor((int8_t *)result, (int8_t *)result, (int8_t *)result_ge_half_max, + 2 * num); __bang_ge_scalar((int16_t *)result_ge_half_max, (int16_t *)result_ge_half_max, 1, num); __nram__ int16_t table_half_inf[LUT_TABEL_LENGTH] = {(int16_t)0xffff, (int16_t)0xfc00}; __bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max, (int16_t *)table_half_inf, num, LUT_TABEL_LENGTH); - __bang_band((int16_t *)result, (int16_t *)result, - (int16_t *)result_ge_half_max, num); + __bang_band((int8_t *)result, (int8_t *)result, + (int8_t *)result_ge_half_max, 2 * num); } if (std::is_same::value) { __cn_vector_cast_f32_to_s32(num, (int *)result, result_float); @@ -236,8 +236,10 @@ __mlu_func__ void dealBaseNegative(const float start, const float end, __bang_float2int32((int *)floor_y, floor_y, 
actual_deal_num, 0); __bang_move(y_copy, log2_result, sizeof(float) * actual_deal_num); __bang_float2int32((int *)y_copy, y_copy, actual_deal_num, 0); - __bang_band((int *)y_copy, (int *)y_copy, all_int_1, actual_deal_num); - __bang_band((int *)y_copy, (int *)y_copy, (int *)floor_y, actual_deal_num); + __bang_band((int8_t *)y_copy, (int8_t *)y_copy, (int8_t *)all_int_1, + 4 * actual_deal_num); + __bang_band((int8_t *)y_copy, (int8_t *)y_copy, (int8_t *)floor_y, + 4 * actual_deal_num); __nram__ uint32_t table_for_odd_or_even_power[LUT_TABEL_LENGTH] = { 0, 0x80000000}; __bang_lut((int32_t *)y_copy, (uint32_t *)y_copy, @@ -247,12 +249,12 @@ __mlu_func__ void dealBaseNegative(const float start, const float end, __bang_lut((int32_t *)floor_y, (uint32_t *)floor_y, (int32_t *)table_for_integer_power, actual_deal_num, LUT_TABEL_LENGTH); - __bang_bor((int *)log2_result, (int *)log2_result, (int *)floor_y, - actual_deal_num); + __bang_bor((int8_t *)log2_result, (int8_t *)log2_result, (int8_t *)floor_y, + 4 * actual_deal_num); __bang_mul_scalar(log2_result, log2_result, base_log, actual_deal_num); __bang_pow2(result_float, log2_result, actual_deal_num); - __bang_bor((int *)result_float, (int *)result_float, (int *)y_copy, - actual_deal_num); + __bang_bor((int8_t *)result_float, (int8_t *)result_float, (int8_t *)y_copy, + 4 * actual_deal_num); float2DifferentType(result_float, result, actual_deal_num); __memcpy(res + loop_offset, result, actual_deal_num * sizeof(T), NRAM2GDRAM); diff --git a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu index e8e9ad0eb..734d680cc 100644 --- a/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu +++ b/kernels/moe_dispatch/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu @@ -183,7 +183,7 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate1( if ((samples < taskDim) && 
(taskId == 0)) { T *nram_grad_gates = (T *)nram_buffer; - __bang_write_zero(nram_grad_gates, samples); + __bang_write_value(nram_grad_gates, samples, (T)0); if (samples > 1) { int one_sample_task_num = taskDim / samples; @@ -285,7 +285,7 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate2( __bang_and(nram_mask, nram_mask, nram_indices, deal_s_num); T *nram_grad_gates = (T *)nram_indices; - __bang_write_zero(nram_grad_gates, deal_s_num); + __bang_write_value(nram_grad_gates, deal_s_num, (T)0); if (deal_s_num > 1) { T *base_dispatch_addr = (T *)dispatch; diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu index 9ff2a72e8..3795f41f0 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu @@ -370,7 +370,7 @@ void __mlu_func__ loadValue( b_col * spatial_size * qid_stride + level_start_id * qid_stride; } #endif - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask2, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); @@ -379,7 +379,7 @@ void __mlu_func__ loadValue( num_deal_grid * deal_num_real, 0); __bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table, num_deal_grid * deal_num_real, 64); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask1, deal_num_real * num_deal_grid, num_deal_grid); __sync_io_move_compute(); @@ -397,7 +397,7 @@ void __mlu_func__ loadValue( (int8_t *)grad_temp3, num_deal_grid * deal_num_real *
sizeof(float)); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask4, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); @@ -410,7 +410,7 @@ void __mlu_func__ loadValue( (int8_t *)grad_temp3, num_deal_grid * deal_num_real * sizeof(float)); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, mask3, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); @@ -438,7 +438,7 @@ void __mlu_func__ computeGradValue( float *nram_grid_offset2, const int32_t &batch, float *nram_grad_output_tl, float *nram_grad_output_tr, float *nram_grad_output_bl, float *nram_grad_output_br, float *nram_grad_weight) { - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); + __bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0); __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, deal_num_real * num_deal_grid, num_deal_grid); __bang_transpose(grad_temp3, grad_temp1, @@ -600,7 +600,7 @@ void __mlu_func__ computeGradAttnWeight( const int32_t &num_per_time_real, const int32_t &num_heads, const int32_t &num_levels, const int32_t &num_points, const int32_t &grid_offset, float *nram_h_high_temp) { - __bang_write_zero(grad_w_weight, 2 * offset_nram); + __bang_write_value(grad_w_weight, 2 * offset_nram, (float)0); // grad_output_nram_tl __bang_transpose(grad_weight, nram_grad_output_tl, num_deal_grid, deal_num_real); @@ -714,7 +714,7 @@ void __mlu_func__ computeGradSampingLoc( num_points * deal_num_real, num_per_time_real * num_heads * num_levels); - __bang_write_zero(grad_temp1, num_deal_grid * deal_num_real); + __bang_write_value(grad_temp1, num_deal_grid * deal_num_real, 
(float)0); __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, num_deal_grid * deal_num_real, num_deal_grid); __bang_transpose(nram_grad_output_tr, grad_temp1, diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu index 66505f6c0..774b588d9 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu @@ -231,9 +231,12 @@ __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( const float w4 = lh * lw; if (likely(C_tail != 0)) { const int32_t base_ptr = m_col * channels + C_repeat * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); + __bang_write_value(grad_h_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_w_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_output_nram, PAD_UP(channels, ALIGN_NUM), + (float)0); __memcpy(top_grad, grad_output + grad_output_offset + C_repeat * deal_num, @@ -250,9 +253,12 @@ __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( } for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) { const int32_t base_ptr = m_col * channels + C_loop * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); + __bang_write_value(grad_h_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_w_weight, PAD_UP(channels, ALIGN_NUM), + (float)0); + __bang_write_value(grad_output_nram, PAD_UP(channels, ALIGN_NUM), + (float)0); __memcpy(top_grad, grad_output + grad_output_offset + C_loop * deal_num, deal_num * LEN_FLOAT,
GDRAM2NRAM); diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu index e1abf634d..fa324ca94 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu @@ -418,7 +418,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( c_real_num = c_rem; } } - __bang_write_zero((float *)input_tl, 4 * deal_num * channel); + __bang_write_value((float *)input_tl, 4 * deal_num * channel, (float)0); __sync(); // load data_value for (int32_t p_idx = 0; p_idx < io_data_num; ++p_idx) { diff --git a/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu b/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu index 11246d931..55a4ca8e7 100644 --- a/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu +++ b/kernels/mutual_information/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu @@ -116,11 +116,11 @@ __mlu_func__ void computePGrad(const int b, const int S, const int T, float *nram_cur_term2 = nram_cur_term1 + min_len + 1; float *nram_cur_p_grad = nram_cur_term2 + min_len; - __bang_write_zero(nram_cur_term1, 3 * min_len + 1); + __bang_write_value(nram_cur_term1, 3 * min_len + 1, (float)0); // compute the last one: p_grad[b][s_end][t_end] = ans_grad[b] - __memcpy(nram_p_grad + s_end * (T + 1) + t_end, ans_grad + b, - sizeof(float), GDRAM2NRAM); + __memcpy(nram_p_grad + s_end * (T + 1) + t_end, ans_grad + b, sizeof(float), + GDRAM2NRAM); nram_cur_p_grad[0] = nram_p_grad[s_end * (T + 1) + t_end]; int data_num = 0; @@ -242,7 +242,8 @@ __mlu_global__ void mluBlock3PipelineMutualInformationBackward( t_begin = boundary[1]; s_end = boundary[2]; t_end = 
boundary[3]; - __bang_write_zero((float *)nram_buffer, S * (T + 1) + (S + 1) * T); + __bang_write_value((float *)nram_buffer, S * (T + 1) + (S + 1) * T, + (float)0); if (s_begin > s_end || t_begin > t_end) { if (S > 0) { diff --git a/kernels/nms_rotated/nms_rotated_union1.mlu b/kernels/nms_rotated/nms_rotated_union1.mlu index d384cce4b..de5ab33c1 100644 --- a/kernels/nms_rotated/nms_rotated_union1.mlu +++ b/kernels/nms_rotated/nms_rotated_union1.mlu @@ -150,7 +150,8 @@ __mlu_func__ void nms_detection( void *vec2_y = (float *)vec2_x + 4 * max_seg_iou_pad; // First, initialize ram with all 0, or could cause nan/inf unexcepted results - __bang_write_zero((uint8_t *)score, copies_of_nram * max_seg_iou_pad); + __bang_write_value((uint8_t *)score, copies_of_nram * max_seg_iou_pad, + (uint8_t)0); for (int keep = 0; keep < input_box_num; keep++) { __sync_cluster(); @@ -270,7 +271,8 @@ __mlu_func__ void nms_detection( // Initialize valid_box, set actual_box_num boxes to 1, else set to 0 __bang_write_value(((float *)valid_box), seg_len, 1.0f); if (cpy_len < seg_len) { - __bang_write_zero((float *)valid_box + cpy_len, seg_len - cpy_len); + __bang_write_value((float *)valid_box + cpy_len, seg_len - cpy_len, + (float)0); } // Each box data: x, y, w, h, a @@ -323,8 +325,8 @@ __mlu_func__ void nms_detection( (float *)temp3_ram, (float *)temp4_ram, seg_len); // initialize valid_pts, nums_in - __bang_write_zero((float *)valid_pts, 24 * seg_len); - __bang_write_zero((float *)nums_in_ram, seg_len); + __bang_write_value((float *)valid_pts, 24 * seg_len, (float)0); + __bang_write_value((float *)nums_in_ram, seg_len, (float)0); // 3. 
Get all intersection points getIntersectionPoints( diff --git a/kernels/nms_rotated/nms_utils.h b/kernels/nms_rotated/nms_utils.h index 23a7b6434..c06cdc15c 100644 --- a/kernels/nms_rotated/nms_utils.h +++ b/kernels/nms_rotated/nms_utils.h @@ -840,7 +840,7 @@ __mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box, actual_compute_box_num); // temp1 = area, initialize with all 0 - __bang_write_zero((T *)temp1_ram, actual_compute_box_num); + __bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)0); __bang_argmax((T *)temp6_ram, (T *)nums_in_ram, actual_compute_box_num); // temp_nums_in = max(nums_in) diff --git a/kernels/prior_box/prior_box_block.mlu b/kernels/prior_box/prior_box_block.mlu index 6c3bc42bd..082965ad2 100644 --- a/kernels/prior_box/prior_box_block.mlu +++ b/kernels/prior_box/prior_box_block.mlu @@ -33,7 +33,7 @@ __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void generate_AbAb_Mask(T *a_mask, T a_index, T *b_mask, T b_index, T *tmp, T *result, const int align_num) { - __bang_write_zero(result, align_num); + __bang_write_value(result, align_num, (T)0); __bang_mul_scalar(tmp, a_mask, a_index, align_num); __bang_add(result, result, tmp, align_num); __bang_mul_scalar(tmp, b_mask, b_index, align_num); @@ -79,7 +79,7 @@ __mlu_global__ void mluKernelPriorVar(const int height, const int width, const int loop_pixel_num = pixel_end_index - pixel_begin_index; T *loop_gdram_ptr = var + pixel_begin_index * num_priors * 4; __gdramset(loop_gdram_ptr, loop_pixel_num * num_priors * 4, (T)0); - __bang_write_zero(var_nram, loop_pixel_num * one_var_size); + __bang_write_value(var_nram, loop_pixel_num * one_var_size, (T)0); __bang_cycle_add(var_nram, var_nram, variances_nram, loop_pixel_num * one_var_size, one_var_size); // memcpy to gdram diff --git a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu index 5cc589956..2f33e9032 100644 --- 
a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu @@ -433,7 +433,7 @@ __mlu_global__ void roiAlignRotatedForward( if (cur_cache_c + c_cache_i > channels) { cur_cache_c = channels - c_cache_i; } - __bang_write_zero(output_channels, cur_cache_c); + __bang_write_value(output_channels, cur_cache_c, (T)0); for (uint32_t h_idx = 0; h_idx < roi_bin_grid_h; h_idx += bin_order_num) { uint32_t deal_bin_h = bin_order_num; diff --git a/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu b/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu index ab06ec577..0035ce1f8 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu +++ b/kernels/roipoint_pool3d/roipoint_pool3d_union1.mlu @@ -217,8 +217,8 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6); // use auxiliary_a to auxiliary_f - __bang_write_zero((T *)auxiliary_a, - PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE)); + __bang_write_value((T *)auxiliary_a, + PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE), (T)0); if (repeat > 0) { __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), @@ -377,7 +377,8 @@ __mlu_global__ void MLUKernelRoipointPool3d( for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) { __memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T), boxes_num * 7 * sizeof(T), GDRAM2NRAM); - __bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE)); + __bang_write_value((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE), + (int32_t)0); const int8_t *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T); diff --git a/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu b/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu index fe2631d95..7e68a20cb 100644 --- a/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu +++ 
b/kernels/roipoint_pool3d/roipoint_pool3d_union1_large_boxes_num.mlu @@ -213,8 +213,8 @@ __mlu_func__ void computeStoreLastBlockRoipointPool3d( int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6); // use auxiliary_a to auxiliary_f - __bang_write_zero((T *)auxiliary_a, - PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE)); + __bang_write_value((T *)auxiliary_a, + PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE), (T)0); if (repeat > 0) { __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T), diff --git a/kernels/three_interpolate/three_interpolate_union1.mlu b/kernels/three_interpolate/three_interpolate_union1.mlu index 1d0ba2e0e..1d0c8aa36 100644 --- a/kernels/three_interpolate/three_interpolate_union1.mlu +++ b/kernels/three_interpolate/three_interpolate_union1.mlu @@ -307,7 +307,8 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( // transpose the indices and weights for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { __bang_write_value(nram_indices_transpose + index * n_limit, n_limit, -1); - __bang_write_zero(nram_weights_transpose + index * n_limit, n_limit); + __bang_write_value(nram_weights_transpose + index * n_limit, n_limit, + (T)0); __memcpy(nram_indices_transpose + index * n_limit, nram_indices + index, sizeof(int32_t), NRAM2NRAM, sizeof(int32_t), INDEX_WEIGHT_LAST_DIM * sizeof(int32_t), actual_n_size - 1); @@ -330,7 +331,7 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( MIN(CEIL_ALIGN(c_slice % c_limit, align_base_128), c_limit_new); } // 1.2 load Co*Mo features data - __bang_write_zero(nram_output, output_deal_size); + __bang_write_value(nram_output, output_deal_size, (T)0); uint32_t m_rem = m; for (uint32_t k = 0; k < m_repeated_times; ++k) { uint32_t m_slice = m_limit < m_rem ? m_limit : m_rem; @@ -355,8 +356,8 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( m_slice, GDRAM2NRAM, m_limit_new, m, c_slice - 1); } // 2. 
Compute - __bang_write_zero(nram_features_transpose, - features_deal_size + c_limit); + __bang_write_value(nram_features_transpose, + features_deal_size + c_limit, (T)0); c_limit = c_limit_new; m_limit = m_limit_new; // 2.1 transpose features from Co*Mo to Mo*Co to easily select one whole @@ -366,8 +367,8 @@ __mlu_global__ void MLUKernelThreeInterpolateForward( uint32_t m_min = k * m_limit_org; uint32_t m_max = m_min + m_slice; for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { - __bang_write_zero(nram_features, output_deal_size); - __bang_write_zero(nram_features_selected, output_deal_size); + __bang_write_value(nram_features, output_deal_size, (T)0); + __bang_write_value(nram_features_selected, output_deal_size, (T)0); // 2.2 select the offset between the m_min and m_max // convert indices from int32_t to float if (m <= INT2FLOAT_KEEP_PRECISION_MAX_VALUE) { @@ -542,7 +543,7 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( MIN(CEIL_ALIGN(c_slice % c_limit, align_base_128), c_limit_new); } // initial the nram_grad_features with 0 - __bang_write_zero(nram_grad_features, grad_features_deal_size); + __bang_write_value(nram_grad_features, grad_features_deal_size, (T)0); uint32_t n_rem = n; for (uint32_t k = 0; k < n_repeated_times; ++k) { uint32_t n_slice = n_limit < n_rem ? 
n_limit : n_rem; @@ -579,7 +580,8 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { __bang_write_value(nram_indices_transpose + index * n_limit, n_limit, -1); - __bang_write_zero(nram_weights_transpose + index * n_limit, n_limit); + __bang_write_value(nram_weights_transpose + index * n_limit, n_limit, + (T)0); __memcpy(nram_indices_transpose + index * n_limit_new, nram_indices + index, sizeof(int32_t), NRAM2NRAM, sizeof(int32_t), INDEX_WEIGHT_LAST_DIM * sizeof(int32_t), @@ -595,8 +597,8 @@ __mlu_global__ void MLUKernelThreeInterpolateBackward( // initial nram_grad_output_transpose with zero // and set extra c_limit size that will be selected by the index not in // [m_min, m_max) - __bang_write_zero(nram_grad_output_transpose, - grad_output_deal_size + c_limit); + __bang_write_value(nram_grad_output_transpose, + grad_output_deal_size + c_limit, (T)0); c_limit = c_limit_new; n_limit = n_limit_new; for (uint32_t index = 0; index < INDEX_WEIGHT_LAST_DIM; ++index) { diff --git a/kernels/tin_shift/tin_shift_union1.mlu b/kernels/tin_shift/tin_shift_union1.mlu index 95dc2216a..c34f79484 100644 --- a/kernels/tin_shift/tin_shift_union1.mlu +++ b/kernels/tin_shift/tin_shift_union1.mlu @@ -40,7 +40,7 @@ __mlu_func__ void mluMultiKernelTinShift( int t_shift = shifts[n_index * group_size + group_id]; int index = cur_channel_index % channel_size * hw_size + n_index * time_size * channel_size * hw_size; - __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (int8_t)0); __asm__ volatile("sync;"); if (abs(t_shift) >= time_size) { __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, @@ -123,7 +123,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence( int next_sequence_index = index / hw_size / channel_size % time_size + segmentime_size; int cur_sequence_index = index / hw_size / channel_size % time_size; - __bang_write_value(data_nram, 
MAX_NRAM_SIZE, (char)0); + __bang_write_value(data_nram, MAX_NRAM_SIZE, (int8_t)0); __asm__ volatile("sync;"); if (max_number_hw_per_core == 0) { mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, diff --git a/kernels/voxelization/voxelization_kernel.mlu b/kernels/voxelization/voxelization_kernel.mlu index 9832ab4bf..82efbc1cd 100644 --- a/kernels/voxelization/voxelization_kernel.mlu +++ b/kernels/voxelization/voxelization_kernel.mlu @@ -502,7 +502,7 @@ __mlu_global__ void mluCalcPointsPerVoxel( // generate 0~deal_num indices. __mluop_get_stage_indices_tfuse(nram_base_offset, max_nram_count); - __bang_write_zero(nram_temp_zeros, max_nram_count); + __bang_write_value(nram_temp_zeros, max_nram_count, (int32_t)0); for (int32_t i = 0; i <= repeat; i++) { if (i == repeat && rem == 0) { break; diff --git a/kernels/yolo_box/yolo_box_block.mlu b/kernels/yolo_box/yolo_box_block.mlu index b258351ab..65a65fed3 100644 --- a/kernels/yolo_box/yolo_box_block.mlu +++ b/kernels/yolo_box/yolo_box_block.mlu @@ -217,7 +217,7 @@ __mlu_func__ void compute(T *nram_x, T *nram_y, T *nram_w, T *nram_h, if (clip_bbox == true) { // bx0 = bx0 > 0 ? bx0 : 0; // by0 = by0 > 0 ? 
by0 : 0; - __bang_write_zero(nram_conf_p, deal_num); + __bang_write_value(nram_conf_p, deal_num, (T)0); __bang_maxequal(nram_x_p, nram_conf_p, nram_x_p, deal_num); __bang_maxequal(nram_y_p, nram_conf_p, nram_y_p, deal_num); @@ -406,16 +406,16 @@ __mlu_func__ void YoloBoxComputeBbox( n_in, class_num, anchor_s, anchor_s, 0, c_in, hw_total_num, hw_seg_num, align_hw_seg_num, 0, 0); - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); initCxyParam(nram_cx, nram_cy, n_in, anchor_s, w_in, hw_seg_num, align_hw_seg_num, hw_data_offset); - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, n_in, anchor_s, anchor_s, 0, align_hw_seg_num); - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(img_size, nram_img_w, nram_img_h, n_in, anchor_s, align_hw_seg_num); __sync(); @@ -440,12 +440,12 @@ __mlu_func__ void YoloBoxComputeBbox( T *base_addr_x = (T *)x + hw_data_offset; T *base_addr_boxes = (T *)boxes + hw_data_offset; - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); initCxyParam(nram_cx, nram_cy, deal_n_num, anchor_s, w_in, hw_seg_num, align_hw_seg_num, hw_data_offset); - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, deal_n_num, anchor_s, anchor_s, 0, align_hw_seg_num); @@ -467,8 +467,8 @@ __mlu_func__ void 
YoloBoxComputeBbox( // C int *addr_img_size = (int *)img_size; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, deal_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -494,8 +494,8 @@ __mlu_func__ void YoloBoxComputeBbox( // C int *addr_img_size = (int *)img_size + (n_iter + 1) * deal_n_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, deal_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -525,8 +525,8 @@ __mlu_func__ void YoloBoxComputeBbox( if (repeat_n > 0) { // C int *addr_img_size = (int *)img_size + (repeat_n - 1) * deal_n_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, deal_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -548,8 +548,8 @@ __mlu_func__ void YoloBoxComputeBbox( if (rem_n_num > 0) { // C int *addr_img_size = (int *)img_size + repeat_n * deal_n_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, rem_n_num, anchor_s, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -575,8 +575,8 @@ __mlu_func__ void YoloBoxComputeBbox( T *base_addr_x = (T *)x + hw_data_offset; T *base_addr_boxes = (T 
*)boxes + hw_data_offset; - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); initCxyParam(nram_cx, nram_cy, 1, deal_s_num, w_in, hw_seg_num, align_hw_seg_num, hw_data_offset); @@ -617,14 +617,14 @@ __mlu_func__ void YoloBoxComputeBbox( } // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, deal_s_num, 0, align_hw_seg_num); int *addr_img_size = (int *)img_size; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, deal_s_num, align_hw_seg_num); compute(nram_x, nram_y, nram_w, nram_h, nram_conf, nram_iou, nram_cx, @@ -671,8 +671,8 @@ __mlu_func__ void YoloBoxComputeBbox( } // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); int anchor_offset = (ns_iter + 1) * deal_s_num; initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, deal_s_num, anchor_offset, align_hw_seg_num); @@ -684,8 +684,8 @@ __mlu_func__ void YoloBoxComputeBbox( anchor_num = anchor_s - s_num_offset; next_batch = deal_s_num > anchor_num; anchor_num = next_batch ? 
anchor_num : deal_s_num; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, anchor_num, align_hw_seg_num); @@ -728,8 +728,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (repeat_ns > 0) { // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); int anchor_offset = (repeat_ns - 1) * deal_s_num; initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, deal_s_num, anchor_offset, align_hw_seg_num); @@ -741,8 +741,8 @@ __mlu_func__ void YoloBoxComputeBbox( int anchor_num = anchor_s - s_num_offset; bool next_batch = deal_s_num > anchor_num; anchor_num = next_batch ? anchor_num : deal_s_num; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, anchor_num, align_hw_seg_num); @@ -774,8 +774,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (rem_ns_num > 0) { // C - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); int anchor_offset = repeat_ns * deal_s_num; initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, rem_ns_num, anchor_offset, align_hw_seg_num); @@ -783,8 +783,8 @@ __mlu_func__ void YoloBoxComputeBbox( // init img w/h int batch_num = anchor_offset / anchor_s; int *addr_img_size = (int *)img_size + batch_num * 2; - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); 
initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, rem_ns_num, align_hw_seg_num); @@ -817,13 +817,13 @@ __mlu_func__ void YoloBoxComputeBbox( T *addr_boxes_n = base_addr_boxes + n_iter * output_stride + s_iter * 4 * hw_total_num; - __bang_write_zero(nram_anchor_w, deal_num); - __bang_write_zero(nram_anchor_h, deal_num); + __bang_write_value(nram_anchor_w, deal_num, 0); + __bang_write_value(nram_anchor_h, deal_num, 0); initAnchorParam(anchors, nram_anchor_w, nram_anchor_h, 1, anchor_s, 1, s_iter, deal_num); - __bang_write_zero(nram_img_w, deal_num); - __bang_write_zero(nram_img_h, deal_num); + __bang_write_value(nram_img_w, deal_num, 0); + __bang_write_value(nram_img_h, deal_num, 0); int *addr_img_size = (int *)img_size + n_iter * 2; initImgParam(addr_img_size, nram_img_w, nram_img_h, 1, 1, deal_num); @@ -844,8 +844,8 @@ __mlu_func__ void YoloBoxComputeBbox( deal_num, deal_num, nram_pingpong_num, 1); // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, deal_num, deal_num, hw_offset); @@ -870,8 +870,8 @@ __mlu_func__ void YoloBoxComputeBbox( deal_num, deal_num, nram_pingpong_num, hw_iter + 2); // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset + (hw_iter + 1) * deal_num; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, deal_num, deal_num, hw_offset); @@ -899,8 +899,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (repeat_hw > 0) { // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset + (repeat_hw - 1) * deal_num; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, 
deal_num, deal_num, hw_offset); @@ -921,8 +921,8 @@ __mlu_func__ void YoloBoxComputeBbox( } if (rem_hw_num > 0) { // C - __bang_write_zero(nram_cx, deal_num); - __bang_write_zero(nram_cy, deal_num); + __bang_write_value(nram_cx, deal_num, (T)0); + __bang_write_value(nram_cy, deal_num, (T)0); int hw_offset = hw_data_offset + repeat_hw * deal_num; initCxyParam(nram_cx, nram_cy, 1, 1, w_in, rem_hw_num, deal_num, hw_offset); diff --git a/test/mlu_op_gtest/pb_gtest/include/runtime.h b/test/mlu_op_gtest/pb_gtest/include/runtime.h index 9bc9fdeaa..6d5bc22f7 100644 --- a/test/mlu_op_gtest/pb_gtest/include/runtime.h +++ b/test/mlu_op_gtest/pb_gtest/include/runtime.h @@ -38,7 +38,7 @@ #include "memory_pool.h" #ifndef CNRT_RET_ERR_INVALID -#define CNRT_RET_ERR_INVALID (632007) +#define CNRT_RET_ERR_INVALID ((cnrtRet_t)632007) #endif namespace mluoptest { diff --git a/test/mlu_op_gtest/pb_gtest/src/executor.cpp b/test/mlu_op_gtest/pb_gtest/src/executor.cpp index b8ac27704..626f6c987 100644 --- a/test/mlu_op_gtest/pb_gtest/src/executor.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/executor.cpp @@ -380,7 +380,7 @@ void Executor::setupForPerfIter(int repeat, int iter, int iter_start) { if (perfUseOriginData()) { void *src_data = getPerfSrcData(db); GTEST_CHECK(cnrtMemcpy(db->device_perf_ptr, src_data, db->size, - CNRT_MEM_TRANS_DIR_DEV2DEV) == + cnrtMemcpyDevToDev) == cnrtSuccess); oss << "copy data from " << src_data; } else { @@ -460,7 +460,7 @@ void Executor::setupForPerfIter(int repeat, int iter, int iter_start) { if (skipMallocDevice(db.getMetaTensor())) continue; void *src_data = getPerfSrcData(&db); GTEST_CHECK(cnrtMemcpy(db.device_perf_ptr, src_data, db.size, - CNRT_MEM_TRANS_DIR_DEV2DEV) == cnrtSuccess); + cnrtMemcpyDevToDev) == cnrtSuccess); } } } @@ -2148,7 +2148,7 @@ void Executor::copyIn() { VLOG(4) << "copy from device_origin_ptr to device_perf_data_ptr"; GTEST_CHECK(cnrtSuccess == cnrtMemcpy(db->device_perf_data_ptr, db->device_origin_ptr, - db->size, 
CNRT_MEM_TRANS_DIR_DEV2DEV)); + db->size, cnrtMemcpyDevToDev)); } // for debug if (exe_config_->dump_data) { diff --git a/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp b/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp index 0e5f808ec..14d3a026a 100644 --- a/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/hardware_monitor.cpp @@ -65,10 +65,6 @@ void hardwareMonitor::start() { std::bind(&hardwareMonitor::monitorFrequencyOneGRepeat, this), monitor_hwtime); })); - monitor_threads.emplace_back(std::thread([&, this] { - monitorAllGRepeat(std::bind(&hardwareMonitor::monitorPowerOneGRepeat, this), - monitor_hwtime); - })); monitor_hwtime = true; monitor_threads.emplace_back(std::thread([&, this] { monitorAllGRepeat( @@ -166,62 +162,6 @@ void hardwareMonitor::monitorFrequencyOneGRepeat() { << "us per call."; } -void hardwareMonitor::monitorPowerOneGRepeat() { - std::ofstream power_file(results_dir + "/power_device_" + - std::to_string(global_var.dev_id_) + ".csv", - std::ios::app); - power_file << "relative_time(ns),instantaneous_power(W),average_power(W)\n"; - cndevDevice_t dev_id; - GTEST_CHECK(cnrtGetDevice(&dev_id) == cnrtSuccess); - GTEST_CHECK(cndevInit(0) == CNDEV_SUCCESS); - int i = 1; - - cndevPowerInfo_t power_info_prev, power_info_curr; - size_t t_prev, t_curr; - auto getPower = [&, this]() { - power_info_curr.version = CNDEV_VERSION_5; - t_curr = MONITOR_CLOCK::now().time_since_epoch().count() - start_time_point; - // TODO(None): cntoolkit-3.6, use cndevGetDevicePower - // GTEST_CHECK(cndevGetDevicePower(&power_info_curr, dev_id) == - // CNDEV_SUCCESS); - GTEST_CHECK(cndevGetPowerInfo(&power_info_curr, dev_id) == CNDEV_SUCCESS); - }; - - MONITOR_CLOCK::time_point t1 = MONITOR_CLOCK::now(); - getPower(); - std::tie(t_prev, power_info_prev) = std::make_tuple(t_curr, power_info_curr); - power_file << t_prev << "," - << (uint32_t)(power_info_prev.instantaneousPowerUsage) << "," - << power_info_prev.usage << "\n"; - 
while (!status.finish_one_grepeat) { - ++i; - getPower(); - if (power_info_prev.instantaneousPowerUsage != - power_info_curr.instantaneousPowerUsage || - power_info_prev.usage != power_info_curr.usage) { - power_file << t_prev << "," - << (uint32_t)(power_info_prev.instantaneousPowerUsage) << "," - << power_info_prev.usage << "\n"; - power_file << t_curr << "," - << (uint32_t)(power_info_curr.instantaneousPowerUsage) << "," - << power_info_curr.usage << "\n"; - power_info_prev = power_info_curr; - } - t_prev = t_curr; - } - power_file << t_curr << "," - << (uint32_t)(power_info_curr.instantaneousPowerUsage) << "," - << power_info_curr.usage << "\n"; - MONITOR_CLOCK::time_point t2 = MONITOR_CLOCK::now(); - auto time_span = - std::chrono::duration_cast>(t2 - - t1); - // TODO(None): cntoolkit-3.6, remove this warning - LOG(WARNING) << "From cntoolkit-3.6 onward, use cndevGetDevicePower."; - VLOG(4) << "cndevGetDevicePower took " << time_span.count() / i - << "us per call."; -} - void hardwareMonitor::monitorHwtimeOneGRepeat() { { std::unique_lock lock(status.monitor_mutex);