Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring TensorRT for future development #11

Closed
wants to merge 14 commits into from
21 changes: 15 additions & 6 deletions perception/object_detector/object_detector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,39 @@ namespace mrover {

ObjectDetectorBase::ObjectDetectorBase() : rclcpp::Node(NODE_NAME), mLoopProfiler{get_logger()} {

    // Locals receiving the declared ROS parameters. They are consumed when
    // constructing mModel below and go out of scope when this constructor returns.
    std::string modelName;
    float modelScoreThreshold{};
    float modelNMSThreshold{};

    std::vector<ParameterWrapper> params{
            {"camera_frame", mCameraFrame, "zed_left_camera_frame"},
            {"world_frame", mWorldFrame, "map"},
            {"increment_weight", mObjIncrementWeight, 2},
            {"decrement_weight", mObjDecrementWeight, 1},
            {"hitcount_threshold", mObjHitThreshold, 5},
            {"hitcount_max", mObjMaxHitcount, 10},
            {"model_name", modelName, "Large-Dataset"},
            {"model_score_threshold", modelScoreThreshold, 0.75},
            {"model_nms_threshold", modelNMSThreshold, 0.5}};

    // Declares every parameter above on this node and fills in the bound variables.
    ParameterWrapper::declareParameters(this, params);

    // NOTE(review): assumes a colcon workspace layout where the install prefix sits
    // at <ws>/install/mrover next to <ws>/src/mrover — confirm for non-source installs.
    std::filesystem::path packagePath = std::filesystem::path{ament_index_cpp::get_package_prefix("mrover")} / ".." / ".." / "src" / "mrover";

    RCLCPP_INFO_STREAM(get_logger(), "Opening Model " << modelName);

    RCLCPP_INFO_STREAM(get_logger(), "Found package path " << packagePath);

    // Initialize the TensorRT inference object first: its input/output tensor
    // sizes parameterize the Model description built next.
    mTensorRT = TensortRT{modelName, packagePath.string()};

    // TODO(quintin): Do not hard code exactly two classes / the class names here.
    mModel = Model(modelName, {0, 0}, {"bottle", "hammer"}, mTensorRT.getInputTensorSize(), mTensorRT.getOutputTensorSize(), {modelScoreThreshold, modelNMSThreshold}, ObjectDetectorBase::preprocessYOLOv8Input, ObjectDetectorBase::parseYOLOv8Output);

    mDebugImgPub = create_publisher<sensor_msgs::msg::Image>("object_detector/debug_img", 1);

    RCLCPP_INFO_STREAM(get_logger(), std::format("Object detector initialized with model: {} and thresholds: {} and {}", mModel.modelName, modelScoreThreshold, modelNMSThreshold));
}

StereoObjectDetector::StereoObjectDetector() {
Expand Down
41 changes: 34 additions & 7 deletions perception/object_detector/object_detector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,57 @@ namespace mrover {
class ObjectDetectorBase : public rclcpp::Node {

protected:
struct Model {
std::string modelName;

std::vector<int> objectHitCounts;

std::vector<std::string> classes;

std::vector<int64_t> inputTensorSize;

std::vector<int64_t> outputTensorSize;

// Additional space for params that one algorithm might need and not others
std::vector<float> buffer;

// Converts an rgb image to a blob
std::function<void(Model const&, cv::Mat&, cv::Mat&, cv::Mat&)> rbgImageToBlob;

// Converts an output tensor into a vector of detections
std::function<void(Model const&, cv::Mat&, std::vector<Detection>&)> outputTensorToDetections;

Model() = default;

Model(std::string _modelName, std::vector<int> _objectHitCounts, std::vector<std::string> _classes, std::vector<int64_t> _inputTensorSize, std::vector<int64_t> _outputTensorSize, std::vector<float> _buffer, std::function<void(Model const&, cv::Mat&, cv::Mat&, cv::Mat&)> _rbgImageToBlob, std::function<void(Model const&, cv::Mat&, std::vector<Detection>&)> _outputTensorToDetections) : modelName{std::move(_modelName)}, objectHitCounts(std::move(_objectHitCounts)), classes(std::move(_classes)), inputTensorSize(std::move(_inputTensorSize)), outputTensorSize(std::move(_outputTensorSize)), buffer(std::move(_buffer)), rbgImageToBlob{std::move(_rbgImageToBlob)}, outputTensorToDetections{std::move(_outputTensorToDetections)} {}
};

static constexpr char const* NODE_NAME = "object_detector";

std::unique_ptr<tf2_ros::Buffer> mTfBuffer = std::make_unique<tf2_ros::Buffer>(get_clock());
std::shared_ptr<tf2_ros::TransformBroadcaster> mTfBroadcaster = std::make_shared<tf2_ros::TransformBroadcaster>(this);
std::shared_ptr<tf2_ros::TransformListener> mTfListener = std::make_shared<tf2_ros::TransformListener>(*mTfBuffer);

Model mModel;

std::string mCameraFrame;
std::string mWorldFrame;

std::string mModelName;
cv::Mat mRgbImage, mImageBlob;

LoopProfiler mLoopProfiler;

TensortRT mTensorRT;

cv::Mat mRgbImage, mImageBlob;
sensor_msgs::msg::Image mDetectionsImageMessage;

rclcpp::Publisher<sensor_msgs::msg::Image>::SharedPtr mDebugImgPub;
rclcpp::Subscription<sensor_msgs::msg::PointCloud2>::SharedPtr mSensorSub;

// TODO(quintin): Do not hard code exactly two classes
std::vector<int> mObjectHitCounts{0, 0};

int mObjIncrementWeight{};
int mObjDecrementWeight{};
int mObjHitThreshold{};
int mObjMaxHitcount{};
float mModelScoreThreshold{};
float mModelNmsThreshold{};

auto spiralSearchForValidPoint(sensor_msgs::msg::PointCloud2::UniquePtr const& cloudPtr,
std::size_t u, std::size_t v,
Expand All @@ -50,6 +71,12 @@ namespace mrover {

static auto drawDetectionBoxes(cv::InputOutputArray image, std::span<Detection const> detections) -> void;

auto static parseYOLOv8Output(Model const& model,
cv::Mat& output,
std::vector<Detection>& detections) -> void;

auto static preprocessYOLOv8Input(Model const& model, cv::Mat& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob) -> void;

public:
explicit ObjectDetectorBase();

Expand Down
99 changes: 99 additions & 0 deletions perception/object_detector/object_detector.parsing.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#include "object_detector.hpp"
namespace mrover {
auto ObjectDetectorBase::preprocessYOLOv8Input(Model const& model, cv::Mat& rgbImage, cv::Mat& blobSizedImage, cv::Mat& blob) -> void {
    // A YOLOv8 model takes a rank-4 (NCHW) input tensor; anything else means the
    // wrong model description was paired with this preprocessor.
    if (model.inputTensorSize.size() != 4) {
        throw std::runtime_error("Expected Blob Size to be of size 4, are you using the correct model type?");
    }

    // This model variant carries exactly two extra parameters (score + NMS thresholds).
    if (model.buffer.size() != 2) {
        throw std::runtime_error("Expected 2 additional parameters!");
    }

    // Scale factor mapping 8-bit pixel values into the [0, 1] range.
    static constexpr double BYTE_TO_UNIT = 1.0 / 255.0;

    // NOTE(review): assumes inputTensorSize[2]/[3] are width/height — for the
    // square YOLOv8 input the order is immaterial, but confirm for non-square models.
    cv::Size inputSize{static_cast<int32_t>(model.inputTensorSize[2]), static_cast<int32_t>(model.inputTensorSize[3])};
    cv::resize(rgbImage, blobSizedImage, inputSize);
    cv::dnn::blobFromImage(blobSizedImage, blob, BYTE_TO_UNIT, inputSize, cv::Scalar{}, false, false);
}

auto ObjectDetectorBase::parseYOLOv8Output(Model const& model, cv::Mat& output, std::vector<Detection>& detections) -> void {
    // A YOLOv8 head emits a [dimensions x candidates] tensor (e.g. 84 x 8400),
    // where dimensions = 4 box values + one score per class, so cols > rows.
    if (output.cols <= output.rows) {
        // std::format avoids the fixed-size snprintf buffer the original used,
        // which could silently truncate the message.
        throw std::runtime_error(std::format("Something is wrong with the model interpretation: {} cols and {} rows", output.cols, output.rows));
    }

    int rows = output.cols;       // number of candidate detections
    int dimensions = output.rows; // 4 box values + per-class scores

    // Reshape to [dimensions x rows] and transpose so each row is one candidate:
    // [x, y, w, h, score_0, ..., score_{numClasses-1}].
    output = output.reshape(1, dimensions);
    cv::transpose(output, output);

    // The image was already resized to the network input during preprocessing,
    // so no distortion correction is needed here.
    float const xFactor = 1.0;
    float const yFactor = 1.0;

    // Intermediate storage for candidates that survive the score threshold.
    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<cv::Rect> boxes;

    // Each row of the output is a detection with a bounding box and associated class scores.
    for (int r = 0; r < rows; ++r) {
        // Skip the first four values: they are the box data, not scores.
        cv::Mat scores = output.row(r).colRange(4, dimensions);

        cv::Point classId;
        double maxClassScore;
        cv::minMaxLoc(scores, nullptr, &maxClassScore, nullptr, &classId);

        // buffer[0] is the model score threshold; disregard weak candidates.
        if (maxClassScore <= model.buffer[0]) continue;

        confidences.push_back(static_cast<float>(maxClassScore));
        classIds.push_back(classId.x);

        // Columns [0, 4) hold the box as (center x, center y, width, height).
        cv::Mat box = output.row(r).colRange(0, 4);
        auto x = box.at<float>(0);
        auto y = box.at<float>(1);
        auto w = box.at<float>(2);
        auto h = box.at<float>(3);

        // Convert the center/size box into an integer top-left/size pixel rect.
        auto left = static_cast<int>((x - 0.5 * w) * xFactor);
        auto top = static_cast<int>((y - 0.5 * h) * yFactor);
        auto width = static_cast<int>(w * xFactor);
        auto height = static_cast<int>(h * yFactor);

        // Push back the box into storage.
        boxes.emplace_back(left, top, width, height);
    }

    // Coalesce overlapping boxes: buffer[0] = score threshold, buffer[1] = NMS threshold.
    std::vector<int> nmsResult;
    cv::dnn::NMSBoxes(boxes, confidences, model.buffer[0], model.buffer[1], nmsResult);

    // Fill in the output detections vector from the surviving candidates.
    for (int i: nmsResult) {
        Detection result;

        // Fill in the id and confidence for the class.
        result.classId = classIds[i];
        result.confidence = confidences[i];

        // Fill in the class name and box information.
        result.className = model.classes[result.classId];
        result.box = boxes[i];

        // Push the detection into storage.
        detections.push_back(result);
    }
}

}; // namespace mrover
23 changes: 15 additions & 8 deletions perception/object_detector/object_detector.processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,26 @@ namespace mrover {
RCLCPP_INFO_STREAM(get_logger(), std::format("Image size changed from [{}, {}] to [{}, {}]", mRgbImage.cols, mRgbImage.rows, msg->width, msg->height));
mRgbImage = cv::Mat{static_cast<int>(msg->height), static_cast<int>(msg->width), CV_8UC3, cv::Scalar{0, 0, 0, 0}};
}

// if 0x0 image
if (!mRgbImage.total()) {
return;
}

convertPointCloudToRGB(msg, mRgbImage);

// TODO(quintin): Avoid hard coding blob size
cv::Size blobSize{640, 640};
// Convert the RGB Image into the blob Image format
cv::Mat blobSizedImage;
cv::resize(mRgbImage, blobSizedImage, blobSize);
cv::dnn::blobFromImage(blobSizedImage, mImageBlob, 1.0 / 255.0, blobSize, cv::Scalar{}, false, false);
mModel.rbgImageToBlob(mModel, mRgbImage, blobSizedImage, mImageBlob);

mLoopProfiler.measureEvent("Conversion");

// Run the blob through the model
std::vector<Detection> detections{};
mTensorRT.modelForwardPass(mImageBlob, detections, mModelScoreThreshold, mModelNmsThreshold);
cv::Mat outputTensor;
mTensorRT.modelForwardPass(mImageBlob, outputTensor);

mModel.outputTensorToDetections(mModel, outputTensor, detections);

mLoopProfiler.measureEvent("Execution");

Expand Down Expand Up @@ -66,10 +73,10 @@ namespace mrover {
// Push the immediate detections to the camera frame
SE3Conversions::pushToTfTree(*mTfBroadcaster, objectImmediateFrame, mCameraFrame, objectInCamera.value(), get_clock()->now());
// Since the object is seen we need to increment the hit counter
mObjectHitCounts[classId] = std::min(mObjMaxHitcount, mObjectHitCounts[classId] + mObjIncrementWeight);
mModel.objectHitCounts[classId] = std::min(mObjMaxHitcount, mModel.objectHitCounts[classId] + mObjIncrementWeight);

// Only publish to permament if we are confident in the object
if (mObjectHitCounts[classId] > mObjHitThreshold) {
if (mModel.objectHitCounts[classId] > mObjHitThreshold) {
std::string objectPermanentFrame = className;
// Grab the object inside of the camera frame and push it into the map frame
SE3d objectInMap = SE3Conversions::fromTfTree(*mTfBuffer, objectImmediateFrame, mWorldFrame);
Expand All @@ -90,7 +97,7 @@ namespace mrover {
if (seenObjects[i]) continue;

assert(i < mObjectHitCounts.size());
mObjectHitCounts[i] = std::max(0, mObjectHitCounts[i] - mObjDecrementWeight);
mModel.objectHitCounts[i] = std::max(0, mModel.objectHitCounts[i] - mObjDecrementWeight);
}
}

Expand Down
1 change: 1 addition & 0 deletions perception/object_detector/pch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <filesystem>
#include <format>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
Expand Down
Loading