From 9012cc8f7909a4de9897213ed5eb24dc35e1ed16 Mon Sep 17 00:00:00 2001 From: chqy99 <141810829+chqy99@users.noreply.github.com> Date: Fri, 22 Sep 2023 18:02:13 +0800 Subject: [PATCH] Nolink check (#845) --- .../border_align_backward_union1.mlu | 19 +-- .../border_align_forward_union1.mlu | 6 +- .../focal_loss_sigmoid_forward_union1.mlu | 6 +- .../indice_convolution_backward_data.h | 4 +- .../msda_forward_union1_default.mlu | 123 +++++++----------- 5 files changed, 55 insertions(+), 103 deletions(-) diff --git a/bangc-ops/kernels/border_align_backward/border_align_backward_union1.mlu b/bangc-ops/kernels/border_align_backward/border_align_backward_union1.mlu index 61c23ad01..23b1b2e6a 100644 --- a/bangc-ops/kernels/border_align_backward/border_align_backward_union1.mlu +++ b/bangc-ops/kernels/border_align_backward/border_align_backward_union1.mlu @@ -165,32 +165,25 @@ __mlu_func__ void computeImpl(T *nram_grad_output, const T *grad_output, &x_high, &y_low, &y_high, &empty); if (!empty) { // load argmax, - __memcpy(nram_argmax_idx, argmax_idx + src_offset, - deal_num * sizeof(int32_t), GDRAM2NRAM); // NOLINT + __memcpy(nram_argmax_idx, argmax_idx + src_offset, deal_num * sizeof(int32_t), GDRAM2NRAM); // NOLINT /* Creat mask, mask.shape([1, deal_num]) is the same as argmax_idx * mask[1, j] = (T)1 if (argmax_idx[1, j] == pool_idx) * = (T)0 otherwise */ __bang_write_value(nram_grad_output, deal_num_align, int32_t(i)); - __bang_eq(nram_argmax_idx, nram_argmax_idx, (int32_t *)nram_grad_output, - deal_num_align); // NOLINT + __bang_eq(nram_argmax_idx, nram_argmax_idx, (int32_t *)nram_grad_output, deal_num_align); // NOLINT if (__mluop_is_float()) { __nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT + __bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT } else { __nram__ int16_t table[COMPUTE_COUNT_ALIGN] = {0, (int16_t)0xffff}; - __bang_int322int16((int16_t *)nram_argmax_idx, - (int32_t *)nram_argmax_idx, deal_num_align, 0, - 0); // NOLINT - __bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT + __bang_int322int16((int16_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, deal_num_align, 0, 0); // NOLINT + __bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx, table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT } // load grad_output, and calculate grad_input - __memcpy(nram_grad_output, grad_output + src_offset, deal_num * sizeof(T), - GDRAM2NRAM); // NOLINT + __memcpy(nram_grad_output, grad_output + src_offset, deal_num * sizeof(T), GDRAM2NRAM); // NOLINT computeGradInput(nram_grad_input, nram_grad_output, grad_input, (T *)nram_argmax_idx, w1, w2, w3, w4, x_low, y_low, x_high, y_high, origin_c, c, origin_w, n, origin_h, diff --git a/bangc-ops/kernels/border_align_forward/border_align_forward_union1.mlu b/bangc-ops/kernels/border_align_forward/border_align_forward_union1.mlu index b18449b6f..8409ad96f 100644 --- a/bangc-ops/kernels/border_align_forward/border_align_forward_union1.mlu +++ b/bangc-ops/kernels/border_align_forward/border_align_forward_union1.mlu @@ -258,10 +258,8 @@ __mlu_func__ void pipeline(T *input_ping_nram, const T *input, T *boxes_nram, deal_num_align); // S - __memcpy(base_output + c_offset, output_nram, deal_num * sizeof(T), - NRAM2GDRAM); // NOLINT - __memcpy(base_argmax_idx + c_offset, argmax_idx_nram, - deal_num * sizeof(int32_t), NRAM2GDRAM); // NOLINT + __memcpy(base_output + c_offset, output_nram, deal_num * sizeof(T), NRAM2GDRAM); // NOLINT + __memcpy(base_argmax_idx + c_offset, argmax_idx_nram, deal_num * sizeof(int32_t), NRAM2GDRAM); // NOLINT } template diff --git a/bangc-ops/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu b/bangc-ops/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu index 92bf121e1..4e833c675 100644 --- a/bangc-ops/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu +++ b/bangc-ops/kernels/focal_loss_sigmoid/focal_loss_sigmoid_forward_union1.mlu @@ -232,10 +232,8 @@ __mlu_func__ void compute(const focalLossSigmoidPreference_t prefer, __bang_le_scalar(input, compute_b, (float)FLT_MAX, deal_num); __bang_float2int32((int32_t *)input, input, deal_num, 0); __nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)input, (int32_t *)input, table, deal_num, - COMPUTE_COUNT_ALIGN); // NOLINT - __bang_band((char *)compute_b, (char *)compute_b, (char *)input, - sizeof(float) * deal_num); // NOLINT + __bang_lut_s32((int32_t *)input, (int32_t *)input, table, deal_num, COMPUTE_COUNT_ALIGN); // NOLINT + __bang_band((char *)compute_b, (char *)compute_b, (char *)input, sizeof(float) * deal_num); // NOLINT __bang_sub(compute_a, compute_a, compute_b, deal_num); // 3. output = alpha_t * p_t^r * [-log(p_t)] diff --git a/bangc-ops/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h b/bangc-ops/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h index 24bdc0d03..88979e70c 100644 --- a/bangc-ops/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h +++ b/bangc-ops/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h @@ -36,6 +36,4 @@ inline int getMaxNumInArray(const int64_t arr[], const int num) { return max_num; } -#endif -// KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ -// NOLINT +#endif // KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT diff --git a/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu b/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu index 14540f7f9..7f59f6036 100644 --- a/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu +++ b/bangc-ops/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu @@ -188,8 +188,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( taskId < block_num_rem ? (batch_size * num_queries * num_heads) / taskDim + 1 : (batch_size * num_queries * num_heads) / taskDim; - for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core; - ++cur_idx) { // NOLINT + for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core; ++cur_idx) { // NOLINT /* cur_idx = batch_idx * num_queries * num_heads + query_idx * num_heads + head_idx @@ -256,13 +255,11 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( c_seg_idx * span_num_deal) * sizeof(T); loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2]; loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + // NOLINT - 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2 + 1]; + weight_next_point = ((T *)data_attn_weight_gdram_start) + [level_idx * num_points + point_idx + 1]; x_next_point = loc_w * spatial_w_next_point - 0.5; y_next_point = loc_h * spatial_h_next_point - 0.5; if (y_next_point > -1 && x_next_point > -1 && @@ -271,17 +268,13 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( loadNeighborPointsData( (T *)data_value_ptr, (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT span_num_deal, spatial_w_next_point, spatial_h_next_point, num_heads, channels, x_next_point, y_next_point, head_idx); } @@ -289,32 +282,26 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( spatial_h_next_point = spatial_h; spatial_w_next_point = spatial_w; loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2]; loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + // NOLINT - 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2 + 1]; + weight_next_point = ((T *)data_attn_weight_gdram_start) + [level_idx * num_points + point_idx + 1]; x_next_point = loc_w * spatial_w - 0.5; y_next_point = loc_h * spatial_h - 0.5; if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && // NOLINT + y_next_point < spatial_h && x_next_point < spatial_w) { loadNeighborPointsData( (T *)data_value_ptr, (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT span_num_deal, spatial_w, spatial_h, num_heads, channels, x_next_point, y_next_point, head_idx); } @@ -323,20 +310,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { computeMsDeformAttn( (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)auxiliary_a, (T *)auxiliary_b, - (T *)(ping_data_col_nram + - data_col_ping_pong_idx * ping_pong_gap), // NOLINT + (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT weight, span_num_deal, spatial_w, spatial_h, x, y); } spatial_w = spatial_w_next_point; @@ -404,13 +386,11 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( channels_seg_num * span_num_deal) * sizeof(T); loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2]; loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + // NOLINT - 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2 + 1]; + weight_next_point = ((T *)data_attn_weight_gdram_start) + [level_idx * num_points + point_idx + 1]; x_next_point = loc_w * spatial_w_next_point - 0.5; y_next_point = loc_h * spatial_h_next_point - 0.5; if (y_next_point > -1 && x_next_point > -1 && @@ -419,17 +399,13 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( loadNeighborPointsData( (T *)data_value_ptr, (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT channels_rem, spatial_w_next_point, spatial_h_next_point, num_heads, channels, x_next_point, y_next_point, head_idx); } @@ -437,13 +413,11 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( spatial_w_next_point = spatial_w; spatial_h_next_point = spatial_h; loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2]; loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + // NOLINT - 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; // NOLINT + [(level_idx * num_points + point_idx + 1) * 2 + 1]; + weight_next_point = ((T *)data_attn_weight_gdram_start) + [level_idx * num_points + point_idx + 1]; x_next_point = loc_w * spatial_w - 0.5; y_next_point = loc_h * spatial_h - 0.5; if (y_next_point > -1 && x_next_point > -1 && @@ -451,17 +425,13 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( loadNeighborPointsData( (T *)data_value_ptr, (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT channels_rem, spatial_w, spatial_h, num_heads, channels, x_next_point, y_next_point, head_idx); } @@ -470,20 +440,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { computeMsDeformAttn( (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * - ping_pong_gap), // NOLINT + ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT (T *)auxiliary_a, (T *)auxiliary_b, - (T *)(ping_data_col_nram + - data_col_ping_pong_idx * ping_pong_gap), // NOLINT + (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT weight, channels_align_rem, spatial_w, spatial_h, x, y); } spatial_w = spatial_w_next_point;