Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feedback when R1 is listening when using wake word + VAD #76

Draft
wants to merge 14 commits into
base: iron
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion app/speechPipeline/scripts/speechPipelineMic.xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,25 @@
</module>

<module>
<name>voiceActivationDetection</name>
<name>yarprobotinterface</name>
<parameters>--context headSynchronizer --from faceDisplay.ini</parameters>
<node>r1-face</node>
</module>

<module>
<name>yarprobotinterface</name>
<parameters>--context vadModule --from audioPlayer.ini</parameters>
<node>r1-face</node>
</module>

<module>
<name>faceExpressionImage5GTour</name>
<parameters></parameters>
<node>console</node>
</module>

<module>
<name>sileroVAD</name>
<parameters></parameters>
<environment>YARP_LOG_PROCESS_LABEL=VAD</environment>
<node>console</node>
Expand Down Expand Up @@ -69,4 +87,22 @@
<protocol>tcp+recv.portmonitor+file.soundfilter_resample+type.dll+channel.0+frequency.16000</protocol>
</connection>

<connection>
<from>/faceExpressionImage/image:o</from>
<to>/robot/faceDisplay/image:i</to>
<protocol>fast_tcp</protocol>
</connection>

<connection>
<from>/notification:i</from>
<to>/audioPlayerWrapper/audio:i</to>
<protocol>tcp+recv.portmonitor+file.soundfilter_resample+type.dll+channel.0+frequency.16000</protocol>
</connection>

<connection>
<from>/wake/face:o</from>
<to>/faceExpressionImage/rpc</to>
<protocol>fast_tcp</protocol>
</connection>

</application>
38 changes: 3 additions & 35 deletions aux_modules/faceExpression/earsThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ bool EarsThread::threadInit()
m_blackBar.setTo(Scalar(0, 0, 0));

this->resetToDefault();
m_earBar.setTo(Scalar(0,0,255));
return true;
}

Expand All @@ -89,33 +90,12 @@ void EarsThread::run()
return;
}

float percentage = 0.5;

yarp::dev::AudioRecorderStatus *rec_status = m_audioStatusPort.read(false);
if (rec_status)
{
m_micIsEnabled = rec_status->enabled; //&& rec_status->current_buffer_size > 0;
//yInfo() << rec_status->current_buffer_size;
// yInfo() << m_micIsEnabled;
}

yarp::sig::Sound* data_audio = m_audioRecPort.read(false);
if(m_doBars)
{
if(data_audio){
auto vec= data_audio->getChannel(0);
short int max_val = *std::max_element(vec.begin(),vec.end());
// max_val = 30000;
percentage = fabs((float)max_val / 32800);
updateBars(percentage);
yInfo() << percentage;
}
updateBars(1);
}
else
{
updateBars(0.5);
}

}

bool EarsThread::updateBars(float percentage)
Expand All @@ -125,18 +105,6 @@ bool EarsThread::updateBars(float percentage)
earBar0_len = earBar0_minLen + (earBar0_maxLen - earBar0_minLen) * percentage;
earBar1_len = earBar1_minLen + (earBar1_maxLen - earBar1_minLen) * percentage;

if(m_micIsEnabled==false){
m_earBar.setTo(Scalar(0,0,255));
percentage=0.5;
}
else
{
m_earBar.setTo(m_earsDefaultColor);
}
if(percentage>0.85){
m_earBar.setTo(Scalar(255,0,0));
}


// Reset bars to black
m_blackBar(Rect(0, 0, barWidth, 32)).copyTo (m_face(cv::Rect(earBarL0_x, 0, barWidth, 32)));
Expand Down Expand Up @@ -165,7 +133,7 @@ void EarsThread::resetToDefault()
lock_guard<recursive_mutex> lg(m_methods_mutex);
m_doBars = true;
m_earBar.setTo(m_earsDefaultColor);
updateBars(0.5);
updateBars(1);
}

void EarsThread::setColor(float vr, float vg, float vb)
Expand Down
2 changes: 2 additions & 0 deletions aux_modules/speechProcessing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ add_subdirectory(messages)
if(WAKE_WORD)
add_subdirectory(wakeWordDetection)
endif()
add_subdirectory(sileroVAD)

find_package(PkgConfig REQUIRED)
pkg_check_modules(libfvad IMPORTED_TARGET libfvad)
if(${libfvad_FOUND})
Expand Down
39 changes: 39 additions & 0 deletions aux_modules/speechProcessing/sileroVAD/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
################################################################################
# #
# Copyright (C) 2020 Fondazione Istituto Italiano di Tecnologia (IIT) #
# All Rights Reserved. #
# #
################################################################################
project(sileroVAD)
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
set(AUX_NAME sileroVAD)
add_executable(${AUX_NAME})
target_include_directories(${AUX_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
$<TARGET_PROPERTY:WakeMsgs,INTERFACE_INCLUDE_DIRECTORIES>
/usr/local/src/robot/onnxruntime-linux-x64-1.20.0/include
)
target_sources(${AUX_NAME}
PRIVATE
main.cpp
Detector.h
Detector.cpp
VoiceActivationDetectionModule.h
VoiceActivationDetectionModule.cpp
)


target_sources(${AUX_NAME}
PRIVATE
${STATECHARTS_FILES})

target_link_libraries(${AUX_NAME}
PRIVATE
${YARP_LIBRARIES}
WakeMsgs
/usr/local/src/robot/onnxruntime-linux-x64-1.20.0/lib/libonnxruntime.so)


install(TARGETS ${AUX_NAME} DESTINATION bin)
191 changes: 191 additions & 0 deletions aux_modules/speechProcessing/sileroVAD/Detector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// SPDX-FileCopyrightText: 2022 Humanoid Sensing and Perception, Istituto Italiano di Tecnologia
// SPDX-License-Identifier: BSD-3-Clause
#ifdef _WIN32
#include <Windows.h>
#else
#include <unistd.h>
#endif
#include "Detector.h"

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <stdexcept>

YARP_LOG_COMPONENT(VADAUDIOPROCESSOR, "behavior_tour_robot.voiceActivationDetection.AudioProcessor", yarp::os::Log::TraceType)

// Voice-activity detector built on an ONNX VAD model (Silero, per the module
// name). Buffers incoming audio into fixed-size windows, runs inference per
// window, and after speech followed by a long-enough silence gap forwards the
// recorded audio on the filtered-audio port and signals stop over the RPC
// client port.
//
// @param vadFrequency            audio sample rate; only 16000 or 8000 supported
// @param gapAllowance            silent windows tolerated before speech is
//                                considered finished
// @param saveGap                 when true, gap (silent) windows are kept in
//                                the outgoing sound
// @param threshold               speech probability above which a window
//                                counts as voice
// @param vadSavePriorToDetection number of pre-roll windows kept before the
//                                first detection (0 disables)
// @param modelPath               path of the ONNX model file
// @param filteredAudioPortOutName name for the filtered-audio output port
// @param wakeWordClientPort      name for the RPC client port
// @throws std::runtime_error when vadFrequency is neither 16000 nor 8000
Detector::Detector(int vadFrequency,
                   int gapAllowance,
                   bool saveGap,
                   float threshold,
                   int vadSavePriorToDetection,
                   const std::string modelPath,
                   std::string filteredAudioPortOutName,
                   std::string wakeWordClientPort):
    m_vadFrequency(vadFrequency),
    m_vadGapAllowance(gapAllowance),
    m_vadSaveGap(saveGap),
    m_vadThreshold(threshold),
    m_vadSavePriorToDetection(vadSavePriorToDetection),
    // The model accepts exactly 512 samples per window at 16 kHz, 256 at 8 kHz.
    m_vadNumSamples((vadFrequency == 16000) ? 512 :
                    (vadFrequency == 8000) ? 256 :
                    throw std::runtime_error("Unsupported sample rate")),
    // Tail of the previous window prepended to each model input (see predict()).
    m_context((vadFrequency == 16000) ? 64 : 32, 0),
    m_currentSoundBufferNorm(m_vadNumSamples, 0),
    m_currentSoundBuffer(m_vadNumSamples, 0),
    m_fillCount(0) {

    init_onnx_model(modelPath);

    // Model input tensor is [1, context + window].
    m_input.resize(m_context.size() + m_vadNumSamples);
    m_input_node_dims[0] = 1;
    m_input_node_dims[1] = m_context.size() + m_currentSoundBuffer.size();

    m_state.resize(m_size_state);
    m_sr.resize(1);
    m_sr[0] = m_vadFrequency;

    reset_states();

    // Port-open failures are logged but not fatal (original behavior kept).
    if (!m_rpcClientPort.open(wakeWordClientPort)){
        yCError(VADAUDIOPROCESSOR) << "cannot open port" << wakeWordClientPort;
    }
    m_rpcClient.yarp().attachAsClient(m_rpcClientPort);

    if (!m_filteredAudioOutputPort.open(filteredAudioPortOutName)){
        // Bug fix: this message previously printed wakeWordClientPort,
        // hiding which port actually failed to open.
        yCError(VADAUDIOPROCESSOR) << "cannot open port" << filteredAudioPortOutName;
    }
}

// Configure the ONNX Runtime session: thread counts for inter-/intra-op
// parallelism, with all graph optimizations disabled.
void Detector::init_engine_threads(int inter_threads, int intra_threads) {
    m_session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
    m_session_options.SetInterOpNumThreads(inter_threads);
    m_session_options.SetIntraOpNumThreads(intra_threads);
}

// Create the inference session for the VAD model stored at model_path.
void Detector::init_onnx_model(const std::string& model_path) {
    // One thread each way is enough for this small model.
    init_engine_threads(1, 1);
    m_session = std::make_shared<Ort::Session>(m_env, model_path.c_str(),
                                               m_session_options);
}

// Zero the model's recurrent state (called at construction and after each
// completed utterance so a new detection does not inherit old state).
void Detector::reset_states() {
    // The original used std::memset(ptr, 0.0f, bytes): the float byte-value
    // argument only worked via silent float->int conversion, and <cstring>
    // was never included. std::fill states the intent directly.
    std::fill(m_state.begin(), m_state.end(), 0.0f);
}

// Run one VAD inference window and drive the speech-segmentation logic.
//
// @param data current audio window normalized to [-1, 1] (filled by onRead;
//             same length as m_currentSoundBuffer, which holds the raw
//             samples that get buffered for sending).
void Detector::predict(const std::vector<float> &data) {
    // Model input is [previous context | normalized window].
    // NOTE(review): the original ignored `data` entirely and copied the raw
    // (un-normalized) m_currentSoundBuffer into the model input and context;
    // Silero VAD expects float audio in [-1, 1], so the normalized window is
    // used here — confirm against the model's input spec.
    std::copy(m_context.begin(), m_context.end(), m_input.begin());
    std::copy(data.begin(), data.end(), m_input.begin() + m_context.size());

    Ort::Value input_ort = Ort::Value::CreateTensor<float>(
        m_memory_info, m_input.data(), m_input.size(), m_input_node_dims, 2);
    Ort::Value state_ort = Ort::Value::CreateTensor<float>(
        m_memory_info, m_state.data(), m_state.size(), m_state_node_dims, 3);
    Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
        m_memory_info, m_sr.data(), m_sr.size(), m_sr_node_dims, 1);

    m_ort_inputs.clear();
    m_ort_inputs.emplace_back(std::move(input_ort));
    m_ort_inputs.emplace_back(std::move(state_ort));
    m_ort_inputs.emplace_back(std::move(sr_ort));

    // Infer: output 0 is the speech probability, output 1 the recurrent
    // state to feed back on the next call.
    m_ort_outputs = m_session->Run(
        Ort::RunOptions{nullptr},
        m_input_node_names.data(), m_ort_inputs.data(), m_ort_inputs.size(),
        m_output_node_names.data(), m_output_node_names.size());

    float speech_prob = m_ort_outputs[0].GetTensorMutableData<float>()[0];
    float *stateN = m_ort_outputs[1].GetTensorMutableData<float>();
    std::memcpy(m_state.data(), stateN, m_size_state * sizeof(float));

    bool isTalking = speech_prob > m_vadThreshold;
    if (isTalking) {
        // Speech: buffer the raw window and reset the silence-gap counter.
        yCDebug(VADAUDIOPROCESSOR) << "Voice detected adding to send buffer";
        m_soundDetected = true;
        m_soundToSend.push_back(m_currentSoundBuffer);
        m_gapCounter = 0;
    } else if (m_soundDetected) {
        // Silence after speech: either the utterance is over, or we are
        // bridging a short gap (optionally keeping the gap audio).
        ++m_gapCounter;
        if (m_gapCounter > m_vadGapAllowance) {
            yCDebug(VADAUDIOPROCESSOR) << "End of speech"; // typo "of of" fixed
            sendSound();
            m_soundToSend.clear();
            m_soundDetected = false;
            m_rpcClient.stop();
            reset_states();
        } else if (m_vadSaveGap) {
            m_soundToSend.push_back(m_currentSoundBuffer);
        }
    } else if (m_vadSavePriorToDetection > 0) {
        // No speech yet: keep a rolling pre-roll of the most recent windows.
        m_soundToSend.push_back(m_currentSoundBuffer);
        if (m_soundToSend.size() > static_cast<size_t>(m_vadSavePriorToDetection)) {
            m_soundToSend.pop_front();
        }
    }

    // Keep the tail of the normalized window as context for the next call.
    std::copy(data.end() - m_context.size(), data.end(), m_context.begin());
}


// Port callback: accumulate incoming samples into the current window (both
// raw and normalized-to-[-1,1] copies) and run the detector each time a
// full window has been filled.
void Detector::onRead(yarp::sig::Sound& soundReceived) {
    const size_t sampleCount = soundReceived.getSamples();

    for (size_t idx = 0; idx < sampleCount; ++idx) {
        const auto rawSample = soundReceived.get(idx);
        m_currentSoundBuffer.at(m_fillCount) = rawSample;
        m_currentSoundBufferNorm.at(m_fillCount) =
            static_cast<float>(rawSample) / INT16_MAX;

        if (++m_fillCount == m_currentSoundBuffer.size()) {
            // Window complete: classify it and start filling the next one.
            predict(m_currentSoundBufferNorm);
            m_fillCount = 0;
        }
    }
}


// Flush the buffered speech windows to the filtered-audio output port,
// zero-padding up to m_minSoundSize packets so downstream consumers always
// receive a minimum-length sound.
void Detector::sendSound() {
    const size_t packetsWithSound = m_soundToSend.size();
    const size_t packetSize = m_currentSoundBuffer.size();

    yarp::sig::Sound& soundToSend = m_filteredAudioOutputPort.prepare();
    yCDebug(VADAUDIOPROCESSOR) << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> sending recorded voice sound";

    // size_t throughout fixes the original's signed/unsigned comparisons
    // between int packet/sample counts and size_t loop indices.
    const size_t minPackets = static_cast<size_t>(m_minSoundSize);
    const size_t totalPackets = packetsWithSound < minPackets ? minPackets
                                                              : packetsWithSound;
    const size_t numSamples = packetSize * totalPackets;
    soundToSend.resize(numSamples);
    soundToSend.setFrequency(m_vadFrequency);

    // Copy every recorded window into the outgoing sound.
    for (size_t p = 0; p < packetsWithSound; ++p) {
        for (size_t i = 0; i < packetSize; ++i) {
            soundToSend.set(m_soundToSend[p].at(i), i + (p * packetSize));
        }
    }

    // Zero-pad the remainder up to the minimum size.
    for (size_t i = packetsWithSound * packetSize; i < numSamples; ++i) {
        soundToSend.set(0, i);
    }

    m_filteredAudioOutputPort.write();
}

Loading