Skip to content

Commit

Permalink
Nolink check (#845)
Browse files Browse the repository at this point in the history
  • Loading branch information
chqy99 authored Sep 22, 2023
1 parent d3ecbf7 commit 9012cc8
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 103 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -165,32 +165,25 @@ __mlu_func__ void computeImpl(T *nram_grad_output, const T *grad_output,
&x_high, &y_low, &y_high, &empty);
if (!empty) {
// load argmax
__memcpy(nram_argmax_idx, argmax_idx + src_offset,
deal_num * sizeof(int32_t), GDRAM2NRAM); // NOLINT
__memcpy(nram_argmax_idx, argmax_idx + src_offset, deal_num * sizeof(int32_t), GDRAM2NRAM); // NOLINT

/* Create mask, mask.shape([1, deal_num]) is the same as argmax_idx
* mask[1, j] = (T)1 if (argmax_idx[1, j] == pool_idx)
* = (T)0 otherwise
*/
__bang_write_value(nram_grad_output, deal_num_align, int32_t(i));
__bang_eq(nram_argmax_idx, nram_argmax_idx, (int32_t *)nram_grad_output,
deal_num_align); // NOLINT
__bang_eq(nram_argmax_idx, nram_argmax_idx, (int32_t *)nram_grad_output, deal_num_align); // NOLINT
if (__mluop_is_float<T>()) {
__nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff};
__bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx,
table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT
__bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT
} else {
__nram__ int16_t table[COMPUTE_COUNT_ALIGN] = {0, (int16_t)0xffff};
__bang_int322int16((int16_t *)nram_argmax_idx,
(int32_t *)nram_argmax_idx, deal_num_align, 0,
0); // NOLINT
__bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx,
table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT
__bang_int322int16((int16_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, deal_num_align, 0, 0); // NOLINT
__bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx, table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT
}

// load grad_output, and calculate grad_input
__memcpy(nram_grad_output, grad_output + src_offset, deal_num * sizeof(T),
GDRAM2NRAM); // NOLINT
__memcpy(nram_grad_output, grad_output + src_offset, deal_num * sizeof(T), GDRAM2NRAM); // NOLINT
computeGradInput(nram_grad_input, nram_grad_output, grad_input,
(T *)nram_argmax_idx, w1, w2, w3, w4, x_low, y_low,
x_high, y_high, origin_c, c, origin_w, n, origin_h,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,8 @@ __mlu_func__ void pipeline(T *input_ping_nram, const T *input, T *boxes_nram,
deal_num_align);

// S
__memcpy(base_output + c_offset, output_nram, deal_num * sizeof(T),
NRAM2GDRAM); // NOLINT
__memcpy(base_argmax_idx + c_offset, argmax_idx_nram,
deal_num * sizeof(int32_t), NRAM2GDRAM); // NOLINT
__memcpy(base_output + c_offset, output_nram, deal_num * sizeof(T), NRAM2GDRAM); // NOLINT
__memcpy(base_argmax_idx + c_offset, argmax_idx_nram, deal_num * sizeof(int32_t), NRAM2GDRAM); // NOLINT
}

template <typename T>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,10 +232,8 @@ __mlu_func__ void compute(const focalLossSigmoidPreference_t prefer,
__bang_le_scalar(input, compute_b, (float)FLT_MAX, deal_num);
__bang_float2int32((int32_t *)input, input, deal_num, 0);
__nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff};
__bang_lut_s32((int32_t *)input, (int32_t *)input, table, deal_num,
COMPUTE_COUNT_ALIGN); // NOLINT
__bang_band((char *)compute_b, (char *)compute_b, (char *)input,
sizeof(float) * deal_num); // NOLINT
__bang_lut_s32((int32_t *)input, (int32_t *)input, table, deal_num, COMPUTE_COUNT_ALIGN); // NOLINT
__bang_band((char *)compute_b, (char *)compute_b, (char *)input, sizeof(float) * deal_num); // NOLINT
__bang_sub(compute_a, compute_a, compute_b, deal_num);

// 3. output = alpha_t * p_t^r * [-log(p_t)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,4 @@ inline int getMaxNumInArray(const int64_t arr[], const int num) {
return max_num;
}

#endif
// KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_
// NOLINT
#endif // KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
taskId < block_num_rem
? (batch_size * num_queries * num_heads) / taskDim + 1
: (batch_size * num_queries * num_heads) / taskDim;
for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core;
++cur_idx) { // NOLINT
for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core; ++cur_idx) { // NOLINT
/*
cur_idx = batch_idx * num_queries * num_heads +
query_idx * num_heads + head_idx
Expand Down Expand Up @@ -256,13 +255,11 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
c_seg_idx * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + // NOLINT
1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point = ((T *)data_attn_weight_gdram_start)
[level_idx * num_points + point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
Expand All @@ -271,50 +268,40 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
span_num_deal, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_h_next_point = spatial_h;
spatial_w_next_point = spatial_w;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + // NOLINT
1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point = ((T *)data_attn_weight_gdram_start)
[level_idx * num_points + point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && // NOLINT
y_next_point < spatial_h &&
x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
span_num_deal, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
Expand All @@ -323,20 +310,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
computeMsDeformAttn(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)auxiliary_a, (T *)auxiliary_b,
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap), // NOLINT
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT
weight, span_num_deal, spatial_w, spatial_h, x, y);
}
spatial_w = spatial_w_next_point;
Expand Down Expand Up @@ -404,13 +386,11 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
channels_seg_num * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + // NOLINT
1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point = ((T *)data_attn_weight_gdram_start)
[level_idx * num_points + point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
Expand All @@ -419,49 +399,39 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
channels_rem, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_w_next_point = spatial_w;
spatial_h_next_point = spatial_h;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + // NOLINT
1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1]; // NOLINT
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point = ((T *)data_attn_weight_gdram_start)
[level_idx * num_points + point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT
channels_rem, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
Expand All @@ -470,20 +440,15 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
computeMsDeformAttn(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap), // NOLINT
((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT
(T *)auxiliary_a, (T *)auxiliary_b,
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap), // NOLINT
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT
weight, channels_align_rem, spatial_w, spatial_h, x, y);
}
spatial_w = spatial_w_next_point;
Expand Down

0 comments on commit 9012cc8

Please sign in to comment.