Fix(bangc-ops): replace __bang_atomic_add with __bang_atomic_reduce_add #854

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
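This PR mechanically rewrites every `__bang_atomic_add` call site to the in-place `__bang_atomic_reduce_add` form. Judging purely from how the arguments move in the hunks below (the BANG C prototypes themselves are not shown on this page), the removed builtin took a scratch NRAM buffer as its first argument, followed by the GMEM destination, the NRAM source, and an element count, while the reduce form drops the scratch buffer and puts the GMEM destination first. A minimal before/after sketch of the rewrite, using hypothetical placeholder names (`gmem_dst`, `nram_src`, `nram_scratch`, `count`):

```cpp
// Before (removed form): four arguments. nram_scratch is an NRAM buffer,
// gmem_dst points into GMEM, nram_src holds the addends, count is the
// number of elements to accumulate.
__bang_atomic_add(nram_scratch, gmem_dst, nram_src, count);

// After (added form): three arguments. The scratch buffer is dropped and
// the GMEM destination moves to the first position; the source and count
// keep their relative order.
__bang_atomic_reduce_add(gmem_dst, nram_src, count);
```

Every hunk below is an instance of this pattern; only the pointer arithmetic spelling out `gmem_dst` and `nram_src` differs from kernel to kernel.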
6 changes: 2 additions & 4 deletions bangc-ops/kernels/carafe/carafe_block.mlu
@@ -367,8 +367,7 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
           (T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
           (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
           ((T *)nram_buf + NRAM_BLOCK / sizeof(T))[mask_index], num_align);
-      __bang_atomic_add(
-          (T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), (T *)base_grad_input,
+      __bang_atomic_reduce_add((T *)base_grad_input,
           (T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), num_align);
       __bang_mul((T *)nram_buf, (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
                  (T *)nram_buf, num_align);
@@ -411,8 +410,7 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
           (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
           ((T *)nram_buf + NRAM_BLOCK / sizeof(T))[mask_index],
           rem_for_loop_align);
-      __bang_atomic_add(
-          (T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), (T *)base_grad_input,
+      __bang_atomic_reduce_add((T *)base_grad_input,
           (T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T), rem_for_loop);
       __bang_mul((T *)nram_buf, (T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
                  (T *)nram_buf, rem_for_loop_align);
18 changes: 6 additions & 12 deletions bangc-ops/kernels/deform_roi_pool/deform_roi_pool_union1.mlu
@@ -503,23 +503,19 @@ __mlu_func__ void MLUMultiKernelDeformRoiPoolBackward(
       __bang_mul_scalar((T *)nram_tmp4, (T *)nram_grad_output, w4,
                         channels_align);
       __sync();
-      __bang_atomic_add(
-          (T *)nram_tmp1,
+      __bang_atomic_reduce_add(
           (T *)(offset_grad_input + (y_low * width + x_low) * channels +
                 channel_offset),
           (T *)nram_tmp1, channels_num);
-      __bang_atomic_add(
-          (T *)nram_tmp2,
+      __bang_atomic_reduce_add(
           (T *)(offset_grad_input + (y_low * width + x_high) * channels +
                 channel_offset),
           (T *)nram_tmp2, channels_num);
-      __bang_atomic_add(
-          (T *)nram_tmp3,
+      __bang_atomic_reduce_add(
           (T *)(offset_grad_input + (y_high * width + x_low) * channels +
                 channel_offset),
           (T *)nram_tmp3, channels_num);
-      __bang_atomic_add(
-          (T *)nram_tmp4,
+      __bang_atomic_reduce_add(
           (T *)(offset_grad_input + (y_high * width + x_high) * channels +
                 channel_offset),
           (T *)nram_tmp4, channels_num);
@@ -645,8 +641,7 @@ __mlu_func__ void MLUMultiKernelDeformRoiPoolBackward(
                          kernel_width, 1, kernel_width, kernel_width, 1);
         __bang_reduce_sum(nram_sum_tmp, nram_sum_tmp,
                           nram_sum_tmp_channel);
-        __bang_atomic_add(
-            (T *)nram_sum_tmp,
+        __bang_atomic_reduce_add(
             (T *)(grad_offset +
                   out_batch * pooled_width * pooled_height * 2 +
                   out_height * pooled_width + out_width),
@@ -670,8 +665,7 @@
                          kernel_width, 1, kernel_width, kernel_width, 1);
         __bang_reduce_sum(nram_sum_tmp, nram_sum_tmp,
                           NFU_ALIGN_SIZE / sizeof(T));
-        __bang_atomic_add(
-            (T *)nram_sum_tmp,
+        __bang_atomic_reduce_add(
             (T *)(grad_offset +
                   out_batch * pooled_width * pooled_height * 2 +
                   pooled_width * pooled_height +
3 changes: 1 addition & 2 deletions bangc-ops/kernels/psroipool/psroipool_block.mlu
@@ -285,8 +285,7 @@ __mlu_func__ void psRoiAvgPoolBackwardCompute(
   for (int h = hstart; h < hend; h++) {
     for (int w = wstart; w < wend; w++) {
       int bottom_offset = bottom_add + (h * width + w) * channels + c;
-      __bang_atomic_add(atomic_buffer, bottom_grad + bottom_offset, diff_val,
-                        1);
+      __bang_atomic_reduce_add(bottom_grad + bottom_offset, diff_val, 1);
     }
   }
 }
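The psroipool hunk above is the simplest instance of the rewrite: a single-element accumulate (count of 1). Under the removed form even this case had to supply a scratch buffer (`atomic_buffer`); with the reduce form the call shrinks to destination, value, and count. A sketch using the same hypothetical names as above:

```cpp
// Single-element atomic accumulate into GMEM; no scratch buffer needed.
__bang_atomic_reduce_add(gmem_dst, nram_src, 1);
```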
12 changes: 4 additions & 8 deletions bangc-ops/kernels/roi_align_rotated/roi_align_rotated_block.mlu
@@ -418,26 +418,22 @@ __mlu_global__ void roiAlignRotatedBackward(
           continue;
         } else {
           __bang_mul_scalar(nram_output, nram_ping, w1 * zero_sign, c_limit);
-          __bang_atomic_add(
-              (T *)nram_output,
+          __bang_atomic_reduce_add(
               bottom_grad_dram + batch_idx * height * width * channel +
                   y_low * width * channel + x_low * channel + c_offset,
               (T *)nram_output, c_slice);
           __bang_mul_scalar(nram_output, nram_ping, w2 * zero_sign, c_limit);
-          __bang_atomic_add(
-              (T *)nram_output,
+          __bang_atomic_reduce_add(
               bottom_grad_dram + batch_idx * height * width * channel +
                   y_low * width * channel + x_high * channel + c_offset,
               (T *)nram_output, c_slice);
           __bang_mul_scalar(nram_output, nram_ping, w3 * zero_sign, c_limit);
-          __bang_atomic_add(
-              (T *)nram_output,
+          __bang_atomic_reduce_add(
               bottom_grad_dram + batch_idx * height * width * channel +
                   y_high * width * channel + x_low * channel + c_offset,
               (T *)nram_output, c_slice);
           __bang_mul_scalar(nram_output, nram_ping, w4 * zero_sign, c_limit);
-          __bang_atomic_add(
-              (T *)nram_output,
+          __bang_atomic_reduce_add(
               bottom_grad_dram + batch_idx * height * width * channel +
                   y_high * width * channel + x_high * channel + c_offset,
               (T *)nram_output, c_slice);
23 changes: 10 additions & 13 deletions bangc-ops/kernels/roi_crop/roi_crop_block.mlu
@@ -328,31 +328,28 @@ __mlu_global__ void MLUKernelRoiCropBackward(
     }
     // compute
     if (topLeftIsIn) {
-      __bang_mul_scalar(nram_output, nram_ping, i_tl_x_weight * i_tl_y_weight,
-                        c_limit);
-      __bang_atomic_add(nram_output, grad_input + gi_tl_offset + c_offset,
-                        nram_output, c_slice);
+      __bang_mul_scalar(nram_output, nram_ping,
+                        i_tl_x_weight * i_tl_y_weight, c_limit);
+      __bang_atomic_reduce_add(grad_input + gi_tl_offset + c_offset,
+                               nram_output, c_slice);
     }
     if (topRightIsIn) {
       __bang_mul_scalar(nram_output + c_limit, nram_ping,
                         (1 - i_tl_x_weight) * i_tl_y_weight, c_limit);
-      __bang_atomic_add(nram_output + c_limit,
-                        grad_input + gi_tr_offset + c_offset,
-                        nram_output + c_limit, c_slice);
+      __bang_atomic_reduce_add(grad_input + gi_tr_offset + c_offset,
+                               nram_output + c_limit, c_slice);
     }
     if (bottomLeftIsIn) {
       __bang_mul_scalar(nram_output + 2 * c_limit, nram_ping,
                         i_tl_x_weight * (1 - i_tl_y_weight), c_limit);
-      __bang_atomic_add(nram_output + 2 * c_limit,
-                        grad_input + gi_bl_offset + c_offset,
-                        nram_output + 2 * c_limit, c_slice);
+      __bang_atomic_reduce_add(grad_input + gi_bl_offset + c_offset,
+                               nram_output + 2 * c_limit, c_slice);
     }
     if (bottomRightIsIn) {
       __bang_mul_scalar(nram_output + 3 * c_limit, nram_ping,
                         (1 - i_tl_x_weight) * (1 - i_tl_y_weight), c_limit);
-      __bang_atomic_add(nram_output + 3 * c_limit,
-                        grad_input + gi_br_offset + c_offset,
-                        nram_output + 3 * c_limit, c_slice);
+      __bang_atomic_reduce_add(grad_input + gi_br_offset + c_offset,
+                               nram_output + 3 * c_limit, c_slice);
     }
     c_rem -= c_slice;
     c_offset += c_slice;
@@ -600,14 +600,13 @@ __mlu_global__ void MLUKernelRotatedFeatureAlignBackward(
         const T *cur_br = bottom_input + n_offset +
                           p_y_high * width * channels + p_x_high * channels +
                           channel_offset;
-        __bang_atomic_add((T *)nram_ping, (T *)cur_tl, (T *)nram_ping,
-                          channels_num);
-        __bang_atomic_add((T *)(nram_ping + deal_num), (T *)cur_tr,
-                          (T *)(nram_ping + deal_num), channels_num);
-        __bang_atomic_add((T *)(nram_ping + 2 * deal_num), (T *)cur_bl,
-                          (T *)(nram_ping + 2 * deal_num), channels_num);
-        __bang_atomic_add((T *)(nram_ping + 3 * deal_num), (T *)cur_br,
-                          (T *)(nram_ping + 3 * deal_num), channels_num);
+        __bang_atomic_reduce_add((T *)cur_tl, (T *)nram_ping, channels_num);
+        __bang_atomic_reduce_add((T *)cur_tr, (T *)(nram_ping + deal_num),
+                                 channels_num);
+        __bang_atomic_reduce_add((T *)cur_bl, (T *)(nram_ping + 2 * deal_num),
+                                 channels_num);
+        __bang_atomic_reduce_add((T *)cur_br, (T *)(nram_ping + 3 * deal_num),
+                                 channels_num);
       }
       __sync();
       swap_ptr(nram_ping, nram_pong);
@@ -629,18 +628,17 @@ __mlu_global__ void MLUKernelRotatedFeatureAlignBackward(
         const T *cur_br = bottom_input + n_offset +
                           p_y_high * width * channels + p_x_high * channels +
                           channel_offset;
-        __bang_atomic_add((T *)nram_ping, (T *)cur_tl, (T *)nram_ping,
-                          channels_num);
-        __bang_atomic_add((T *)(nram_ping + deal_num), (T *)cur_tr,
-                          (T *)(nram_ping + deal_num), channels_num);
-        __bang_atomic_add((T *)(nram_ping + 2 * deal_num), (T *)cur_bl,
-                          (T *)(nram_ping + 2 * deal_num), channels_num);
-        __bang_atomic_add((T *)(nram_ping + 3 * deal_num), (T *)cur_br,
-                          (T *)(nram_ping + 3 * deal_num), channels_num);
+        __bang_atomic_reduce_add((T *)cur_tl, (T *)nram_ping, channels_num);
+        __bang_atomic_reduce_add((T *)cur_tr, (T *)(nram_ping + deal_num),
+                                 channels_num);
+        __bang_atomic_reduce_add((T *)cur_bl, (T *)(nram_ping + 2 * deal_num),
+                                 channels_num);
+        __bang_atomic_reduce_add((T *)cur_br, (T *)(nram_ping + 3 * deal_num),
+                                 channels_num);
       }
       // So
-      __bang_atomic_add((T *)ping_out, (T *)cur_bottom_input, (T *)ping_out,
-                        channels_num);
+      __bang_atomic_reduce_add((T *)cur_bottom_input, (T *)ping_out,
+                               channels_num);
       // load next rem c
       if (channel_loop_index + 1 < channel_loops) {
         int channels_num_rem = channels_num;