intermediate review

umrover · Dec 2, 2024 · 8bff333 · 8bff333
1 parent fbab60e
commit 8bff333
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 219 deletions.
diff --git a/perception/object_detector/object_detector.cpp b/perception/object_detector/object_detector.cpp
@@ -1,11 +1,10 @@
 #include "object_detector.hpp"
+#include <functional>
 
 namespace mrover {
 
     ObjectDetectorBase::ObjectDetectorBase(rclcpp::NodeOptions const& options) : rclcpp::Node(NODE_NAME, options), mLoopProfiler{get_logger()} {
         std::string modelName;
-        float modelScoreThreshold{};
-        float modelNMSThreshold{};
 
         std::vector<ParameterWrapper> params{
                 {"camera_frame", mCameraFrame, "zed_left_camera_frame"},
@@ -15,8 +14,8 @@ namespace mrover {
                 {"hitcount_threshold", mObjHitThreshold, 5},
                 {"hitcount_max", mObjMaxHitcount, 10},
                 {"model_name", modelName, "Large-Dataset"},
-                {"model_score_threshold", modelScoreThreshold, 0.75},
-                {"model_nms_threshold", modelNMSThreshold, 0.5}};
+                {"model_score_threshold", mModelScoreThreshold, 0.75},
+                {"model_nms_threshold", mModelNMSThreshold, 0.5}};
 
         ParameterWrapper::declareParameters(this, params);
 
@@ -31,9 +30,11 @@ namespace mrover {
         // Initialize TensorRT Inference Object and Get Important Output Information
         mTensorRT = TensortRT{modelName, packagePath.string()};
 
-        mModel = Model(modelName, {0, 0}, {"bottle", "hammer"}, mTensorRT.getInputTensorSize(), mTensorRT.getOutputTensorSize(), {modelScoreThreshold, modelNMSThreshold}, ObjectDetectorBase::preprocessYOLOv8Input, ObjectDetectorBase::parseYOLOv8Output);
+        using namespace std::placeholders;
 
-        RCLCPP_INFO_STREAM(get_logger(), std::format("Object detector initialized with model: {} and thresholds: {} and {}", mModel.modelName, modelScoreThreshold, modelNMSThreshold));
+        mModel = Model(modelName, {0, 0}, {"bottle", "hammer"}, mTensorRT.getInputTensorSize(), mTensorRT.getOutputTensorSize(), [this](Model const& model, cv::Mat& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob){preprocessYOLOv8Input(model, rgbImage, blobSizedImage, blob);}, [this](Model const& model, cv::Mat& output, std::vector<Detection>& detections){parseYOLOv8Output(model, output, detections);});
+
+        RCLCPP_INFO_STREAM(get_logger(), std::format("Object detector initialized with model: {} and thresholds: {} and {}", mModel.modelName, mModelScoreThreshold, mModelNMSThreshold));
     }
 
     StereoObjectDetector::StereoObjectDetector(rclcpp::NodeOptions const& options) : ObjectDetectorBase(options) {

diff --git a/perception/object_detector/object_detector.hpp b/perception/object_detector/object_detector.hpp
@@ -5,8 +5,16 @@
 namespace mrover {
 
     class ObjectDetectorBase : public rclcpp::Node {
-
     protected:
+        struct Detection {
+            int classId{};
+            std::string className;
+            float confidence{};
+            cv::Rect box;
+
+            Detection(int _classId, std::string _className, float _confidence, cv::Rect _box) : classId{_classId}, className{_className}, confidence{_confidence}, box{_box}{}
+        };
+
         struct Model {
             std::string modelName;
 
@@ -18,20 +26,19 @@ namespace mrover {
 
             std::vector<int64_t> outputTensorSize;
 
-            // Additional space for params that one algorithm might need and not others
-            std::vector<float> buffer;
-
             // Converts an rgb image to a blob
-            std::function<void(Model const&, cv::Mat&, cv::Mat&, cv::Mat&)> rbgImageToBlob;
+            std::function<void(Model const&, cv::Mat&, cv::Mat&, cv::Mat&)> rgbImageToBlob;
 
             // Converts an output tensor into a vector of detections
             std::function<void(Model const&, cv::Mat&, std::vector<Detection>&)> outputTensorToDetections;
 
             Model() = default;
 
-            Model(std::string _modelName, std::vector<int> _objectHitCounts, std::vector<std::string> _classes, std::vector<int64_t> _inputTensorSize, std::vector<int64_t> _outputTensorSize, std::vector<float> _buffer, std::function<void(Model const&, cv::Mat&, cv::Mat&, cv::Mat&)> _rbgImageToBlob, std::function<void(Model const&, cv::Mat&, std::vector<Detection>&)> _outputTensorToDetections) : modelName{std::move(_modelName)}, objectHitCounts(std::move(_objectHitCounts)), classes(std::move(_classes)), inputTensorSize(std::move(_inputTensorSize)), outputTensorSize(std::move(_outputTensorSize)), buffer(std::move(_buffer)), rbgImageToBlob{std::move(_rbgImageToBlob)}, outputTensorToDetections{std::move(_outputTensorToDetections)} {}
+            Model(std::string _modelName, std::vector<int> _objectHitCounts, std::vector<std::string> _classes, std::vector<int64_t> _inputTensorSize, std::vector<int64_t> _outputTensorSize, std::function<void(Model const&, cv::Mat&, cv::Mat&, cv::Mat&)> _rbgImageToBlob, std::function<void(Model const&, cv::Mat&, std::vector<Detection>&)> _outputTensorToDetections) : modelName{std::move(_modelName)}, objectHitCounts(std::move(_objectHitCounts)), classes(std::move(_classes)), inputTensorSize(std::move(_inputTensorSize)), outputTensorSize(std::move(_outputTensorSize)), rgbImageToBlob{std::move(_rbgImageToBlob)}, outputTensorToDetections{std::move(_outputTensorToDetections)} {}
         };
 
+
+
         static constexpr char const* NODE_NAME = "object_detector";
 
         std::unique_ptr<tf2_ros::Buffer> mTfBuffer = std::make_unique<tf2_ros::Buffer>(get_clock());
@@ -57,6 +64,8 @@ namespace mrover {
         int mObjDecrementWeight{};
         int mObjHitThreshold{};
         int mObjMaxHitcount{};
+        float mModelScoreThreshold{};
+        float mModelNMSThreshold{};
 
         auto spiralSearchForValidPoint(sensor_msgs::msg::PointCloud2::UniquePtr const& cloudPtr,
                                        std::size_t u, std::size_t v,
@@ -66,15 +75,15 @@ namespace mrover {
                               std::span<Detection const> detections,
                               cv::Size const& imageSize = {640, 640}) -> void;
 
-        auto publishDetectedObjects(cv::InputArray image) -> void;
-
-        static auto drawDetectionBoxes(cv::InputOutputArray image, std::span<Detection const> detections) -> void;
+        auto publishDetectedObjects(cv::InputArray const& image) -> void;
 
-        auto static parseYOLOv8Output(Model const& model,
+        static auto drawDetectionBoxes(cv::InputOutputArray& image, std::span<Detection const> detections) -> void;
+
+        auto parseYOLOv8Output(Model const& model,
                                       cv::Mat& output,
-                                      std::vector<Detection>& detections) -> void;
+                                      std::vector<Detection>& detections) const -> void;
 
-        auto static preprocessYOLOv8Input(Model const& model, cv::Mat& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob) -> void;
+        static auto preprocessYOLOv8Input(Model const& model, cv::Mat const& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob) -> void;
 
     public:
         explicit ObjectDetectorBase(rclcpp::NodeOptions const& options = rclcpp::NodeOptions());
@@ -105,7 +114,7 @@ namespace mrover {
     public:
         explicit ImageObjectDetector(rclcpp::NodeOptions const& options = rclcpp::NodeOptions());
 
-        auto getTagBearing(cv::InputArray image, cv::Rect const& box) const -> float;
+        auto getTagBearing(cv::InputArray const& image, cv::Rect const& box) const -> float;
 
         auto imageCallback(sensor_msgs::msg::Image::UniquePtr const& msg) -> void;
     };

diff --git a/perception/object_detector/object_detector.parsing.cpp b/perception/object_detector/object_detector.parsing.cpp
@@ -1,22 +1,18 @@
 #include "object_detector.hpp"
 namespace mrover {
-    auto ObjectDetectorBase::preprocessYOLOv8Input(Model const& model, cv::Mat& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob) -> void {
+    auto ObjectDetectorBase::preprocessYOLOv8Input(Model const& model, cv::Mat const& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob) -> void {
         if (model.inputTensorSize.size() != 4) {
             throw std::runtime_error("Expected Blob Size to be of size 4, are you using the correct model type?");
         }
 
-        if (model.buffer.size() != 2) {
-            throw std::runtime_error("Expected 2 additional parameters!");
-        }
-
         static constexpr double UCHAR_TO_DOUBLE = 1.0 / 255.0;
 
         cv::Size blobSize{static_cast<int32_t>(model.inputTensorSize[2]), static_cast<int32_t>(model.inputTensorSize[3])};
         cv::resize(rgbImage, blobSizedImage, blobSize);
         cv::dnn::blobFromImage(blobSizedImage, blob, UCHAR_TO_DOUBLE, blobSize, cv::Scalar{}, false, false);
     }
 
-    auto ObjectDetectorBase::parseYOLOv8Output(Model const& model, cv::Mat& output, std::vector<Detection>& detections) -> void {
+    auto ObjectDetectorBase::parseYOLOv8Output(Model const& model, cv::Mat& output, std::vector<Detection>& detections) const -> void {
         // Parse model specific dimensioning from the output
 
         // The input to this function is expecting a YOLOv8 style model, thus the dimensions should be > rows
@@ -26,12 +22,12 @@ namespace mrover {
             throw std::runtime_error(message.data());
         }
 
-        int rows = output.cols;
-        int dimensions = output.rows;
+        int numCandidates = output.cols;
+        int predictionDimension = output.rows;
 
         // The output of the model is a batchSizex84x8400 matrix
         // This converts the model to a TODO: Check this dimensioning
-        output = output.reshape(1, dimensions);
+        output = output.reshape(1, predictionDimension);
         cv::transpose(output, output);
 
         // This function expects the image to already be in the correct format thus no distrotion is needed
@@ -44,16 +40,16 @@ namespace mrover {
         std::vector<cv::Rect> boxes;
 
         // Each row of the output is a detection with a bounding box and associated class scores
-        for (int r = 0; r < rows; ++r) {
+        for (int r = 0; r < numCandidates; ++r) {
             // Skip first four values as they are the box data
-            cv::Mat scores = output.row(r).colRange(4, dimensions);
+            cv::Mat scores = output.row(r).colRange(4, predictionDimension);
 
             cv::Point classId;
             double maxClassScore;
             cv::minMaxLoc(scores, nullptr, &maxClassScore, nullptr, &classId);
 
             // Determine if the class is an acceptable confidence level, otherwise disregard
-            if (maxClassScore <= model.buffer[0]) continue;
+            if (maxClassScore <= mModelScoreThreshold) continue;
 
             confidences.push_back(static_cast<float>(maxClassScore));
             classIds.push_back(classId.x);
@@ -77,22 +73,12 @@ namespace mrover {
 
         //Coalesce the boxes into a smaller number of distinct boxes
         std::vector<int> nmsResult;
-        cv::dnn::NMSBoxes(boxes, confidences, model.buffer[0], model.buffer[1], nmsResult);
+        cv::dnn::NMSBoxes(boxes, confidences, mModelScoreThreshold, mModelNMSThreshold, nmsResult);
 
         // Fill in the output Detections Vector
         for (int i: nmsResult) {
-            Detection result;
-
-            //Fill in the id and confidence for the class
-            result.classId = classIds[i];
-            result.confidence = confidences[i];
-
-            //Fill in the class name and box information
-            result.className = model.classes[result.classId];
-            result.box = boxes[i];
-
-            //Push back the detection into the for storagevector
-            detections.push_back(result);
+            //Push back the detection into the for storage vector
+            detections.emplace_back(classIds[i], model.classes[classIds[i]], confidences[i], boxes[i]);
         }
     }
 

diff --git a/perception/object_detector/object_detector.processing.cpp b/perception/object_detector/object_detector.processing.cpp
@@ -24,7 +24,7 @@ namespace mrover {
 
         // Convert the RGB Image into the blob Image format
         cv::Mat blobSizedImage;
-        mModel.rbgImageToBlob(mModel, mRgbImage, blobSizedImage, mImageBlob);
+        mModel.rgbImageToBlob(mModel, mRgbImage, blobSizedImage, mImageBlob);
 
         mLoopProfiler.measureEvent("Conversion");
 
@@ -151,7 +151,7 @@ namespace mrover {
         }
     }
 
-    auto ObjectDetectorBase::publishDetectedObjects(cv::InputArray image) -> void {
+    auto ObjectDetectorBase::publishDetectedObjects(cv::InputArray const& image) -> void {
         mDetectionsImageMessage.header.stamp = get_clock()->now();
         mDetectionsImageMessage.height = image.rows();
         mDetectionsImageMessage.width = image.cols();
@@ -189,7 +189,7 @@ namespace mrover {
 
         // Convert the RGB Image into the blob Image format
         cv::Mat blobSizedImage;
-        mModel.rbgImageToBlob(mModel, mRgbImage, blobSizedImage, mImageBlob);
+        mModel.rgbImageToBlob(mModel, mRgbImage, blobSizedImage, mImageBlob);
 
         mLoopProfiler.measureEvent("Conversion");
 
@@ -207,7 +207,7 @@ namespace mrover {
             mrover::msg::ImageTarget target;
             target.name = className;
             target.bearing = getTagBearing(blobSizedImage, box);
-            targets.targets.push_back(target);
+            targets.targets.emplace_back(target);
         }
 
         mTargetsPub->publish(targets);
@@ -218,7 +218,7 @@ namespace mrover {
         mLoopProfiler.measureEvent("Publication");
     }
 
-    auto ImageObjectDetector::getTagBearing(cv::InputArray image, cv::Rect const& box) const -> float {
+    auto ImageObjectDetector::getTagBearing(cv::InputArray const& image, cv::Rect const& box) const -> float {
         cv::Point2f center = cv::Point2f{box.tl()} + cv::Point2f{box.size()} / 2;
         float xNormalized = center.x / static_cast<float>(image.cols());
         float xRecentered = 0.5f - xNormalized;