Skip to content

Commit

Permalink
[Fix](mlu-ops): Remove deprecated instruction for 4.0 (#1187) (#1188)
Browse files Browse the repository at this point in the history
  • Loading branch information
DanieeelLiu authored Dec 30, 2024
1 parent 5c1d4ce commit 8fd98bc
Show file tree
Hide file tree
Showing 33 changed files with 156 additions and 201 deletions.
2 changes: 1 addition & 1 deletion core/runtime/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ inline int32_t getClusterNumberOfJobLimitCapability(mluOpHandle_t handle) {
inline cnrtFunctionType_t castCnKernelClassToCnrtFuncType(KernelClass jobType) {
switch (jobType) {
default:
return CNRT_FUNC_TYPE_MUTABLE;
return cnrtFuncTypeMutable;
case CN_KERNEL_CLASS_BLOCK:
return cnrtFuncTypeBlock;
case CN_KERNEL_CLASS_UNION:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ for (int i = 0; i < samples; ++i) {
int sample_idx = 0;
// 从 workspace load所有中间计算结果
T *nram_grad_gates = (T *)nram_buffer;
__bang_write_zero(nram_grad_gates, samples);
__bang_write_value(nram_grad_gates, samples, (T)0);
for (int ti = 0; ti < taskDim; ti++) {
if ((rem_task > 0) && (ti < (one_sample_task_num + 1) * rem_task)) {
sample_idx = (int)(ti / (one_sample_task_num + 1));
Expand Down Expand Up @@ -567,7 +567,7 @@ for (int i = 0; i < samples; ++i) {

// 复用nram_location空间
T *nram_grad_gates = (T*)nram_location;
__bang_write_zero(nram_grad_gates, deal_s_num);
__bang_write_value(nram_grad_gates, deal_s_num, 0);

// 三级流水计算过程
// step4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,8 @@ void msDeformAttnCol2imBilinear(){

__memcpy(top_grad, grad_output, deal_num * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(top_grad_temp, top_grad, attn_weight, deal_num);
__bang_write_zero(grad_h_weight, deal_num);
__bang_write_zero(grad_w_weight, deal_num);
__bang_write_value(grad_h_weight, deal_num, 0);
__bang_write_value(grad_w_weight, deal_num, 0);
if (h_low >= 0 && w_low >= 0) {
const int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram, grad_output + offset1, deal_num * sizeof(T), GDRAM2NRAM);
Expand Down
2 changes: 1 addition & 1 deletion docs/design_docs/points_in_boxes/points_in_boxes.md
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ void points_in_boxes_kernel(int batch_size, int boxes_num, int pts_num, const fl
X = points[0];
Y = points[m];
Z = points[2*m];
bang_write_zero(last, 0);
bang_write_value(last, 0, 0);
loop boxes for t in range(boxes_num):
boxes = b * T * 7 + t * 7;
(cx, cy, cz, dx, dy, dz, rz) = boxes[0:7];
Expand Down
2 changes: 1 addition & 1 deletion docs/design_docs/prior_box/prior_box_design_doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ mluOpPriorBox(mluOpHandle_t handle,

`one_loop_pixel_num`循环处理,一次循环只初始化一个点的`num_priors`个框的坐标,设该点在`feature_map`上的索引为`pixel_index`

- 调用__bang_write_zero()将`boxes`置为0,即 boxes = [0,0,0,0,0,0,0,0]
- 调用__bang_write_value()将`boxes`置为0,即 boxes = [0,0,0,0,0,0,0,0]
- 计算当前处理的点的位置,x_index = pixel_index % width,y_index = pixel_index / width,图中x_index = 0,1,y_index = 0,1,2。
- `x_index`(上图中x的坐标)和`x_mask`相乘,得到`tmp_x`(tmp_x为x_index,x_mask相乘的结果),tmp_x = [x_index,0,x_index,0,x_index,0,x_index,0]

Expand Down
2 changes: 1 addition & 1 deletion docs/design_docs/roipoint_pool3d/roipoint_pool3d.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ void roipoint_pool3d_union1(const int batch_size,
__memcpy_async(ping_input2, points_y_start + (bs_idx * pts_num) * sizeof(T), span_num_deal_size, GDRAM2NRAM);
__memcpy_async(ping_input3, points_z_start + (bs_idx * pts_num) * sizeof(T), span_num_deal_size, GDRAM2NRAM);
__memcpy_async(point_features, point_features_start, span_num_deal_size, GDRAM2NRAM);
__bang_write_zero((T *)cnt, boxes_num);
__bang_write_value((T *)cnt, boxes_num, (T)0);;

size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
Expand Down
2 changes: 1 addition & 1 deletion kernels/ball_query/ball_query_union1.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ __mlu_func__ void ballQueryWorkflow(
T *new_xyz_nram = vec_new_x1;
__memcpy(new_xyz_nram, &new_xyz[base1], num_deal_new_xyz * 3 * sizeof(T),
GDRAM2NRAM);
__bang_write_zero(vec_idx_num, num_stride);
__bang_write_value(vec_idx_num, num_stride, (int32_t)0);

for (uint32_t new_index = index_new_xyz;
new_index < (index_new_xyz + num_deal_new_xyz);) {
Expand Down
7 changes: 4 additions & 3 deletions kernels/box_iou_rotated/box_iou_rotated_aligned.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2,
const uint32_t max_box_pair =
FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN);
// First, initialize ram with all 0, otherwise it could cause unexpected nan/inf results
__bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair);
__bang_write_value((uint8_t *)nram_buffer, copies_of_nram * max_box_pair,
(uint8_t)0);

void *box1_trans = nram_buffer + 4 * max_box_pair * sizeof(T);
void *box2_trans =
Expand Down Expand Up @@ -224,8 +225,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedAligned(const T *box1, const T *box2,
(T *)temp4_ram, (T *)temp5_ram, actual_compute_box_num);

// initialize valid_pts, nums_in
__bang_write_zero((T *)valid_pts, 24 * actual_compute_box_num);
__bang_write_zero((T *)nums_in_ram, actual_compute_box_num);
__bang_write_value((T *)valid_pts, 24 * actual_compute_box_num, (T)0);
__bang_write_value((T *)nums_in_ram, actual_compute_box_num, (T)0);

// 3. Get all intersection points
getIntersectionPoints(
Expand Down
9 changes: 5 additions & 4 deletions kernels/box_iou_rotated/box_iou_rotated_nonaligned.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2,
const uint32_t max_box_pair =
FLOOR_ALIGN(MAX_NRAM_SIZE / copies_of_nram, COMPUTE_COUNT_ALIGN);
// First, initialize ram with all 0, otherwise it could cause unexpected nan/inf results
__bang_write_zero((uint8_t *)nram_buffer, copies_of_nram * max_box_pair);
__bang_write_value((uint8_t *)nram_buffer, copies_of_nram * max_box_pair,
(uint8_t)0);

void *box1_onchip = nram_buffer + 2 * max_box_pair * sizeof(T);
void *box2_onchip =
Expand Down Expand Up @@ -190,7 +191,7 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2,
const T area_thres = 1e-14;
if (area1 < area_thres) {
// set all current box-pair ious to zeros
__bang_write_zero((T *)ious_ram, actual_compute_box_num);
__bang_write_value((T *)ious_ram, actual_compute_box_num, (T)0);
__memcpy(ious + current_ious_offset, (T *)ious_ram,
actual_box2_num * sizeof(T), NRAM2GDRAM);
continue;
Expand Down Expand Up @@ -309,8 +310,8 @@ __mlu_func__ void MLUUnion1BoxIouRotatedNonAligned(const T *box1, const T *box2,
(T *)temp3_ram, (T *)temp4_ram,
actual_compute_box_num);

__bang_write_zero((T *)valid_pts, 24 * actual_compute_box_num);
__bang_write_zero((T *)nums_in_ram, actual_compute_box_num);
__bang_write_value((T *)valid_pts, 24 * actual_compute_box_num, (T)0);
__bang_write_value((T *)nums_in_ram, actual_compute_box_num, (T)0);

// 3. Get all intersection points
getIntersectionPoints(
Expand Down
10 changes: 5 additions & 5 deletions kernels/box_iou_rotated/box_iou_rotated_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,8 @@ __mlu_func__ void convexHullGraham(
// if all of boxes are invalid, just return.
int valid_box_count = __bang_count((T *)valid_box, real_compute_box_num);
if (!valid_box_count) {
__bang_write_value((T *)ordered_pts_x, total_points, 0);
__bang_write_value((T *)ordered_pts_y, total_points, 0);
__bang_write_value((T *)ordered_pts_x, total_points, (T)0);
__bang_write_value((T *)ordered_pts_y, total_points, (T)0);
__bang_write_value((T *)valid_pts, actual_compute_box_num, (T)1);
__bang_write_value((T *)valid_pts + actual_compute_box_num,
total_points - actual_compute_box_num, (T)0);
Expand Down Expand Up @@ -559,8 +559,8 @@ __mlu_func__ void convexHullGraham(
// assign invalid value to temp1_ram(-2 < -1) and temp2_ram for sorting.
__bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)-2);
__bang_write_value((T *)temp2_ram, actual_compute_box_num, (T)0);
__bang_write_value((T *)ordered_pts_x, total_points, 0);
__bang_write_value((T *)ordered_pts_y, total_points, 0);
__bang_write_value((T *)ordered_pts_x, total_points, (T)0);
__bang_write_value((T *)ordered_pts_y, total_points, (T)0);

// get the offset of each max value according to the channel
__mluop_get_stage_indices_tfuse((int *)temp3_ram, actual_compute_box_num);
Expand Down Expand Up @@ -783,7 +783,7 @@ __mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y, T *valid_box,
actual_compute_box_num);

// temp1 = area, initialize with all 0
__bang_write_zero((T *)temp1_ram, actual_compute_box_num);
__bang_write_value((T *)temp1_ram, actual_compute_box_num, (T)0);
__bang_argmax((T *)temp6_ram, (T *)nums_in_ram, actual_compute_box_num);

// temp_nums_in = max(nums_in)
Expand Down
16 changes: 8 additions & 8 deletions kernels/carafe/carafe_block.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -333,17 +333,17 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
h_k * wo * group * k_up * k_up +
w_k * group * k_up * k_up + group_k * k_up * k_up;

__bang_write_zero((T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
NRAM_BLOCK / sizeof(T));
__bang_write_zero((T *)nram_buf + 4 * NRAM_BLOCK / sizeof(T),
NRAM_BLOCK / sizeof(T));
__bang_write_zero((T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
NRAM_BLOCK / sizeof(T));
__bang_write_value((T *)nram_buf + 2 * NRAM_BLOCK / sizeof(T),
NRAM_BLOCK / sizeof(T), (T)0);
__bang_write_value((T *)nram_buf + 4 * NRAM_BLOCK / sizeof(T),
NRAM_BLOCK / sizeof(T), (T)0);
__bang_write_value((T *)nram_buf + 3 * NRAM_BLOCK / sizeof(T),
NRAM_BLOCK / sizeof(T), (T)0);

__memcpy((T *)nram_buf + NRAM_BLOCK / sizeof(T), (T *)base_mask,
k_up * k_up * sizeof(T), GDRAM2NRAM);
for (int i = 0; i < num_per_loop; i++) {
__bang_write_zero((T *)nram_buf, NRAM_BLOCK / sizeof(T));
__bang_write_value((T *)nram_buf, NRAM_BLOCK / sizeof(T), (T)0);
T *base_grad_output = (T *)grad_output + n_k * ho * wo * c +
h_k * wo * c + w_k * c + group_k * group_size +
i * num_align;
Expand Down Expand Up @@ -386,7 +386,7 @@ __mlu_global__ void MLUKernelCarafeBackward(T *input, T *mask, T *grad_output,
}
}
if (rem_for_loop) {
__bang_write_zero((T *)nram_buf, NRAM_BLOCK / sizeof(T));
__bang_write_value((T *)nram_buf, NRAM_BLOCK / sizeof(T), (T)0);
T *base_grad_output = (T *)grad_output + n_k * ho * wo * c +
h_k * wo * c + w_k * c + group_k * group_size +
num_per_loop * num_align;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ static __mlu_func__ void compute(int8_t *nram_vertices, int8_t *nram_mask,
deal_num * dim_m);

// preprocess to get pad index
__bang_write_zero(nram_temp0, dim_m);
__bang_write_value(nram_temp0, dim_m, (T)0);
__bang_write_value(nram_temp0, INTERSECTION_OFFSET, (T)1.0);
__bang_int82float(nram_pad, (int8_t *)(nram_mask_p), deal_num * dim_m, 0);
__bang_cycle_maxequal(nram_pad, nram_pad, nram_temp0, deal_num * dim_m,
Expand Down
2 changes: 1 addition & 1 deletion kernels/lgamma/lgamma_block.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ __mlu_func__ void calcLgamma(float *buf0, float *buf1, float *buf2, float *buf3,
* reflection_denom - lgamma_x : -reflection_denom;
*/
// using buf3 -> reflection
__bang_write_zero(buf4, num_deal);
__bang_write_value(buf4, num_deal, (float)0);
__bang_sub(buf2, buf4, buf2, num_deal);
isFinite(buf4, buf2, num_deal);
__bang_sub(buf3, buf2, buf1, num_deal);
Expand Down
2 changes: 1 addition & 1 deletion kernels/logspace/logspace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

static void LogspacePolicyFunc(const mluOpHandle_t &handle, const int64_t steps,
cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
*k_type = CNRT_FUNC_TYPE_BLOCK;
*k_type = cnrtFuncTypeBlock;
uint32_t cluster_num =
mluop::runtime::getCoreNumOfEachUnionCapability(handle);
uint32_t core_in_cluster = handle->core_num_per_cluster;
Expand Down
22 changes: 12 additions & 10 deletions kernels/logspace/logspace_block.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,16 @@ __mlu_func__ void float2DifferentType(float *result_float, T *result,
__bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max,
(int16_t *)table_half_all1, num, LUT_TABEL_LENGTH);
__bang_float2half_tz((half *)result, result_float, num);
__bang_bor((int16_t *)result, (int16_t *)result,
(int16_t *)result_ge_half_max, num);
__bang_bor((int8_t *)result, (int8_t *)result, (int8_t *)result_ge_half_max,
2 * num);
__bang_ge_scalar((int16_t *)result_ge_half_max,
(int16_t *)result_ge_half_max, 1, num);
__nram__ int16_t table_half_inf[LUT_TABEL_LENGTH] = {(int16_t)0xffff,
(int16_t)0xfc00};
__bang_lut((int16_t *)result_ge_half_max, (uint16_t *)result_ge_half_max,
(int16_t *)table_half_inf, num, LUT_TABEL_LENGTH);
__bang_band((int16_t *)result, (int16_t *)result,
(int16_t *)result_ge_half_max, num);
__bang_band((int8_t *)result, (int8_t *)result,
(int8_t *)result_ge_half_max, 2 * num);
}
if (std::is_same<T, int>::value) {
__cn_vector_cast_f32_to_s32(num, (int *)result, result_float);
Expand Down Expand Up @@ -236,8 +236,10 @@ __mlu_func__ void dealBaseNegative(const float start, const float end,
__bang_float2int32((int *)floor_y, floor_y, actual_deal_num, 0);
__bang_move(y_copy, log2_result, sizeof(float) * actual_deal_num);
__bang_float2int32((int *)y_copy, y_copy, actual_deal_num, 0);
__bang_band((int *)y_copy, (int *)y_copy, all_int_1, actual_deal_num);
__bang_band((int *)y_copy, (int *)y_copy, (int *)floor_y, actual_deal_num);
__bang_band((int8_t *)y_copy, (int8_t *)y_copy, (int8_t *)all_int_1,
4 * actual_deal_num);
__bang_band((int8_t *)y_copy, (int8_t *)y_copy, (int8_t *)floor_y,
4 * actual_deal_num);
__nram__ uint32_t table_for_odd_or_even_power[LUT_TABEL_LENGTH] = {
0, 0x80000000};
__bang_lut((int32_t *)y_copy, (uint32_t *)y_copy,
Expand All @@ -247,12 +249,12 @@ __mlu_func__ void dealBaseNegative(const float start, const float end,
__bang_lut((int32_t *)floor_y, (uint32_t *)floor_y,
(int32_t *)table_for_integer_power, actual_deal_num,
LUT_TABEL_LENGTH);
__bang_bor((int *)log2_result, (int *)log2_result, (int *)floor_y,
actual_deal_num);
__bang_bor((int8_t *)log2_result, (int8_t *)log2_result, (int8_t *)floor_y,
4 * actual_deal_num);
__bang_mul_scalar(log2_result, log2_result, base_log, actual_deal_num);
__bang_pow2(result_float, log2_result, actual_deal_num);
__bang_bor((int *)result_float, (int *)result_float, (int *)y_copy,
actual_deal_num);
__bang_bor((int8_t *)result_float, (int8_t *)result_float, (int8_t *)y_copy,
4 * actual_deal_num);
float2DifferentType(result_float, result, actual_deal_num);
__memcpy(res + loop_offset, result, actual_deal_num * sizeof(T),
NRAM2GDRAM);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate1(

if ((samples < taskDim) && (taskId == 0)) {
T *nram_grad_gates = (T *)nram_buffer;
__bang_write_zero(nram_grad_gates, samples);
__bang_write_value(nram_grad_gates, samples, (T)0);

if (samples > 1) {
int one_sample_task_num = taskDim / samples;
Expand Down Expand Up @@ -285,7 +285,7 @@ __mlu_global__ void MLUKernelMoeDispatchBwdGate2(
__bang_and(nram_mask, nram_mask, nram_indices, deal_s_num);

T *nram_grad_gates = (T *)nram_indices;
__bang_write_zero(nram_grad_gates, deal_s_num);
__bang_write_value(nram_grad_gates, deal_s_num, 0);

if (deal_s_num > 1) {
T *base_dispatch_addr = (T *)dispatch;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ void __mlu_func__ loadValue(
b_col * spatial_size * qid_stride + level_start_id * qid_stride;
}
#endif
__bang_write_zero(grad_temp1, deal_num_real * num_deal_grid);
__bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0);
__bang_cycle_add(grad_temp1, grad_temp1, mask2, deal_num_real * num_deal_grid,
num_deal_grid);
__bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid);
Expand All @@ -379,7 +379,7 @@ void __mlu_func__ loadValue(
num_deal_grid * deal_num_real, 0);
__bang_lut((int32_t *)grad_temp3, (uint32_t *)grad_temp3, (int32_t *)table,
num_deal_grid * deal_num_real, 64);
__bang_write_zero(grad_temp1, deal_num_real * num_deal_grid);
__bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0);
__bang_cycle_add(grad_temp1, grad_temp1, mask1, deal_num_real * num_deal_grid,
num_deal_grid);
__sync_io_move_compute();
Expand All @@ -397,7 +397,7 @@ void __mlu_func__ loadValue(
(int8_t *)grad_temp3,
num_deal_grid * deal_num_real * sizeof(float));

__bang_write_zero(grad_temp1, deal_num_real * num_deal_grid);
__bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0);
__bang_cycle_add(grad_temp1, grad_temp1, mask4, deal_num_real * num_deal_grid,
num_deal_grid);
__bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid);
Expand All @@ -410,7 +410,7 @@ void __mlu_func__ loadValue(
(int8_t *)grad_temp3,
num_deal_grid * deal_num_real * sizeof(float));

__bang_write_zero(grad_temp1, deal_num_real * num_deal_grid);
__bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0);
__bang_cycle_add(grad_temp1, grad_temp1, mask3, deal_num_real * num_deal_grid,
num_deal_grid);
__bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid);
Expand Down Expand Up @@ -438,7 +438,7 @@ void __mlu_func__ computeGradValue(
float *nram_grid_offset2, const int32_t &batch, float *nram_grad_output_tl,
float *nram_grad_output_tr, float *nram_grad_output_bl,
float *nram_grad_output_br, float *nram_grad_weight) {
__bang_write_zero(grad_temp1, deal_num_real * num_deal_grid);
__bang_write_value(grad_temp1, deal_num_real * num_deal_grid, (float)0);
__bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight,
deal_num_real * num_deal_grid, num_deal_grid);
__bang_transpose(grad_temp3, grad_temp1,
Expand Down Expand Up @@ -600,7 +600,7 @@ void __mlu_func__ computeGradAttnWeight(
const int32_t &num_per_time_real, const int32_t &num_heads,
const int32_t &num_levels, const int32_t &num_points,
const int32_t &grid_offset, float *nram_h_high_temp) {
__bang_write_zero(grad_w_weight, 2 * offset_nram);
__bang_write_value(grad_w_weight, 2 * offset_nram, (float)0);
// grad_output_nram_tl
__bang_transpose(grad_weight, nram_grad_output_tl, num_deal_grid,
deal_num_real);
Expand Down Expand Up @@ -714,7 +714,7 @@ void __mlu_func__ computeGradSampingLoc(
num_points * deal_num_real,
num_per_time_real * num_heads * num_levels);

__bang_write_zero(grad_temp1, num_deal_grid * deal_num_real);
__bang_write_value(grad_temp1, num_deal_grid * deal_num_real, (float)0);
__bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight,
num_deal_grid * deal_num_real, num_deal_grid);
__bang_transpose(nram_grad_output_tr, grad_temp1,
Expand Down
Loading

0 comments on commit 8fd98bc

Please sign in to comment.