open-mmlab · momo609 · May 13, 2024 · Jun 19, 2024 · Jun 18, 2024 · Jun 19, 2024
diff --git a/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp b/mmcv/ops/csrc/pytorch/npu/assign_score_withk_npu.cpp
@@ -0,0 +1,44 @@
+#include "pytorch_npu_helper.hpp"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+// Forward pass of assign_score_withk on NPU. `points` and `centers` are
+// permuted with (0, 3, 1, 2) before aclnnAssignScoreWithk writes `output`.
+void assign_score_withk_forward_npu(int B, int N0, int N1, int M, int K, int O,
+                                    int aggregate, const Tensor& points,
+                                    const Tensor& centers, const Tensor& scores,
+                                    const Tensor& knn_idx, Tensor& output) {
+    auto permuted_points = points.permute({0, 3, 1, 2});
+    auto permuted_centers = centers.permute({0, 3, 1, 2});
+    EXEC_NPU_CMD(aclnnAssignScoreWithk, permuted_points, permuted_centers, scores, knn_idx, B, N0, N1, M, K, O, aggregate, output);
+}
+
+void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
+                                     int aggregate, const Tensor& points,
+                                     const Tensor& centers,
+                                     const Tensor& scores,
+                                     const Tensor& knn_idx, Tensor& output);
+// NOTE(review): *_impl is only declared here; REGISTER_NPU_IMPL presumably binds the NPU function to it (confirm in pytorch_npu_helper.hpp).
+REGISTER_NPU_IMPL(assign_score_withk_forward_impl, assign_score_withk_forward_npu);
+
+
+// Backward pass of assign_score_withk on NPU. `grad_out` is permuted with
+// (0, 2, 3, 1) before aclnnAssignScoreWithkGrad fills the three grad tensors.
+void assign_score_withk_backward_npu(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
+    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
+    Tensor& grad_centers, Tensor& grad_scores) {
+    auto permuted_grad = grad_out.permute({0, 2, 3, 1});
+    EXEC_NPU_CMD(aclnnAssignScoreWithkGrad, permuted_grad, points, centers, scores, knn_idx, B, N0, N1, M, K, O, aggregate, grad_scores, grad_points, grad_centers);
+}
+
+void assign_score_withk_backward_impl(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
+    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
+    Tensor& grad_centers, Tensor& grad_scores);
+// NOTE(review): *_impl is only declared here; REGISTER_NPU_IMPL presumably binds the NPU function to it (confirm in pytorch_npu_helper.hpp).
+REGISTER_NPU_IMPL(assign_score_withk_backward_impl, assign_score_withk_backward_npu);
+
diff --git a/mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp b/mmcv/ops/csrc/pytorch/npu/border_align_npu.cpp
@@ -0,0 +1,53 @@
+#include "pytorch_npu_helper.hpp"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+void border_align_forward_impl(const Tensor &input, const Tensor &boxes, Tensor output,
+                               Tensor argmax_idx, const int pool_size);
+
+// NPU forward: aclnnBorderAlign on the (0, 2, 3, 1)-permuted input, then a max over the sampled axis fills output/argmax_idx.
+void border_align_forward_npu(const Tensor &input, const Tensor &boxes, Tensor output,
+                              Tensor argmax_idx, const int pool_size){
+    TORCH_CHECK(input.dim() == 4, "input.dim() must be 4, but got: ", input.dim());  // guards the size(2)/size(3) reads
+    TORCH_CHECK(boxes.dim() == 3, "boxes.dim() must be 3, but got: ", boxes.dim());  // same rank the backward pass requires
+    TORCH_CHECK(input.size(0) == boxes.size(0), "The batch sizes of feature map and rois must be the same.");
+    TORCH_CHECK(input.size(1) % 4 == 0, "The number of channels must be divisible by 4.");
+    TORCH_CHECK(pool_size >= 2, "The pool size should be no less than 2.");
+    const int64_t batch_size = input.size(0);
+    const int64_t channels = input.size(1);
+    const int64_t num_pixels = input.size(2) * input.size(3);  // height * width, kept in 64 bits
+    at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous();
+    at::Tensor rois_map = boxes.contiguous();
+    at::Tensor temp_tensor = at::zeros({batch_size, num_pixels, pool_size + 1, channels}, input.options());
+    EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pool_size, temp_tensor);
+    auto max_result = temp_tensor.max(-2);  // (values, indices) over the pool_size + 1 axis
+    at::Tensor max_values = std::get<0>(max_result).to(at::kFloat).reshape({batch_size, num_pixels, 4, channels / 4});
+    output.copy_(max_values.permute({0, 3, 1, 2}).contiguous());
+    at::Tensor max_indices = std::get<1>(max_result).to(at::kInt).reshape({batch_size, num_pixels, 4, channels / 4});
+    argmax_idx.copy_(max_indices.permute({0, 3, 1, 2}).contiguous());
+}
+REGISTER_NPU_IMPL(border_align_forward_impl, border_align_forward_npu);
+
+
+void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes,
+                                const Tensor &argmax_idx, Tensor grad_input,
+                                const int pool_size);
+
+// NPU backward of border_align: validates tensor ranks, then lets
+// aclnnBorderAlignGrad write the input gradient into `grad_input`.
+void border_align_backward_npu(const Tensor &grad_output, const Tensor &boxes,
+                               const Tensor &argmax_idx, Tensor grad_input,
+                               const int pool_size){
+    TORCH_CHECK(grad_output.dim() == 4, "grad_out.dim() must be 4, but got: ", grad_output.dim());
+    TORCH_CHECK(boxes.dim() == 3, "boxes.dim() must be 3, but got: ", boxes.dim());
+    TORCH_CHECK(argmax_idx.dim() == 4, "argmax_idx.dim() must be 4, but got: ", argmax_idx.dim());
+    TORCH_CHECK(grad_input.dim() == 4, "grad_input.dim() must be 4, but got: ", grad_input.dim());  // guards size(2)/size(3)
+    int32_t batch_size = grad_output.size(0);
+    int32_t channels = grad_output.size(1);
+    int32_t box_size = boxes.size(1);
+    int32_t height = grad_input.size(2);
+    int32_t width = grad_input.size(3);
+    EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_output, boxes, argmax_idx, channels, box_size, height, width, pool_size, batch_size, grad_input);
+}
+REGISTER_NPU_IMPL(border_align_backward_impl, border_align_backward_npu);
diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_quadri_npu.cpp
@@ -4,10 +4,11 @@ using namespace NPU_NAME_SPACE;
 using namespace std;
 
 void box_iou_quadri_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
-                         const int mode_flag, const bool aligned);
+                          const int mode_flag, const bool aligned);
 
 void box_iou_quadri_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
-                        const int mode_flag, const bool aligned) {
+                         const int mode_flag, const bool aligned) {
+
   TORCH_CHECK(boxes1.size(1) == 8, "boxes1 must be 2D tensor (N, 8)");
   TORCH_CHECK(boxes1.size(1) == 8, "boxes1 must be 2D tensor (N, 8)");
 

diff --git a/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/box_iou_rotated_npu.cpp
@@ -8,13 +8,14 @@ void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
 
 void box_iou_rotated_npu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
                          const int mode_flag, const bool aligned) {
+  // Validate each input on its own: both box tensors need 5 values per row.
   TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)");
-  TORCH_CHECK(boxes1.size(1) == 5, "boxes1 must be 2D tensor (N, 5)");
+  TORCH_CHECK(boxes2.size(1) == 5, "boxes2 must be 2D tensor (N, 5)");
 
   auto trans = false;
   auto is_clockwise = false;
   EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes1, boxes2, trans, is_clockwise,
-               aligned, mode_flag, ious);
+                aligned, mode_flag, ious);
   return;
 }
 

diff --git a/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp b/mmcv/ops/csrc/pytorch/npu/boxes_overlap_bev_npu.cpp
@@ -10,17 +10,16 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
 void iou3d_boxes_overlap_bev_forward_npu(const int num_a, const Tensor boxes_a,
                                          const int num_b, const Tensor boxes_b,
                                          Tensor ans_overlap) {
-  TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)");
-  TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)");
 
-  auto trans = false;
-  auto is_clockwise = false;
-  auto aligned = false;
-  auto mode_flag = 2;
-  EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, trans, is_clockwise,
-               aligned, mode_flag, ans_overlap);
-  return;
+    // Each box row must carry 7 values.
+    TORCH_CHECK(boxes_a.size(1) == 7, "boxes_a must be 2D tensor (N, 7)");
+    TORCH_CHECK(boxes_b.size(1) == 7, "boxes_b must be 2D tensor (N, 7)");
+    // Attribute values this wrapper always passes to aclnnBoxesOverlapBev.
+    const bool trans = false;
+    const bool is_clockwise = false;
+    const bool aligned = false;
+    const int mode_flag = 2;
+    EXEC_NPU_CMD(aclnnBoxesOverlapBev, boxes_a, boxes_b, trans, is_clockwise, aligned, mode_flag, ans_overlap);
 }
 
-REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl,
-                  iou3d_boxes_overlap_bev_forward_npu);
+REGISTER_NPU_IMPL(iou3d_boxes_overlap_bev_forward_impl, iou3d_boxes_overlap_bev_forward_npu);
diff --git a/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp b/mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp
@@ -5,19 +5,34 @@ using namespace std;
 
 void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
                                   Tensor dist2, Tensor idx1, Tensor idx2) {
-  at::Tensor xyz1 = at::ones_like(XYZ1);
-  at::Tensor xyz2 = at::ones_like(XYZ2);
-  xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
-  xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
+  // Half inputs are cast to float32 before the op runs, and the resulting
+  // distances are cast back to half before being stored in dist1/dist2.
+  bool is_half = XYZ1.scalar_type() == at::kHalf;
+  at::Tensor xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
+  at::Tensor xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
+  // Without the cast, the op writes directly into the caller's tensors.
+  at::Tensor distf1 = dist1;
+  at::Tensor distf2 = dist2;
+  if (is_half) {
+    xyz1 = xyz1.to(at::kFloat);
+    xyz2 = xyz2.to(at::kFloat);
+    distf1 = dist1.to(at::kFloat);
+    distf2 = dist2.to(at::kFloat);
+  }
   OpCommand cmd;
   cmd.Name("ChamferDistance")
       .Input(xyz1)
       .Input(xyz2)
-      .Output(dist1)
-      .Output(dist2)
+      .Output(distf1)
+      .Output(distf2)
       .Output(idx1)
       .Output(idx2)
       .Run();
+  if (is_half) {
+    // Only the float32 temporaries need to be copied back to the outputs.
+    dist1.copy_(distf1.to(at::kHalf));
+    dist2.copy_(distf2.to(at::kHalf));
+  }
 }
 
 void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,

diff --git a/mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp b/mmcv/ops/csrc/pytorch/npu/deform_roi_pool.cpp
@@ -53,7 +53,7 @@ void deform_roi_pool_backward_npu(Tensor grad_output, Tensor input, Tensor rois,
       .Output(grad_offset)
       .Attr("output_size", output_size)
       .Attr("spatial_scale", spatial_scale)
-      .Attr("sample_ratio", sampling_ratio_)
+      .Attr("sampling_ratio", sampling_ratio_)
       .Attr("gamma", gamma)
       .Run();
 }

diff --git a/mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp b/mmcv/ops/csrc/pytorch/npu/diff_iou_rotated_npu.cpp
@@ -0,0 +1,28 @@
+#include "pytorch_npu_helper.hpp"
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+// Runs aclnnDiffIouRotatedSortVertices and returns a newly allocated tensor of
+// shape (vertices.size(0), vertices.size(1), 9) created with num_valid's options.
+Tensor diff_iou_rotated_sort_vertices_npu(Tensor vertices,
+                                          Tensor mask,
+                                          Tensor num_valid) {
+    TORCH_CHECK(vertices.dim() == 4, "vertices must be a 4D Tensor, but got: ", vertices.dim());
+    TORCH_CHECK(mask.dim() == 3, "mask must be a 3D Tensor, but got: ", mask.dim());
+    TORCH_CHECK(num_valid.dim() == 2, "num_valid must be a 2D Tensor, but got: ", num_valid.dim());
+
+    const uint32_t dim0 = vertices.size(0);
+    const uint32_t dim1 = vertices.size(1);
+    // The mask is handed to the kernel as a float32 tensor.
+    at::Tensor mask_as_float = mask.to(at::kFloat);
+    at::Tensor sorted_indices = at::empty({dim0, dim1, 9}, num_valid.options());
+    EXEC_NPU_CMD(aclnnDiffIouRotatedSortVertices, vertices, mask_as_float, num_valid, sorted_indices);
+    return sorted_indices;
+}
+
+Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices,
+                                                   Tensor mask,
+                                                   Tensor num_valid);
+// NOTE(review): *_impl is only declared here; REGISTER_NPU_IMPL presumably binds the NPU function to it (confirm in pytorch_npu_helper.hpp).
+REGISTER_NPU_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
+    diff_iou_rotated_sort_vertices_npu);