Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feedback when R1 is listening when using wake word + VAD #76

Draft
wants to merge 14 commits into
base: iron
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion app/speechPipeline/scripts/speechPipelineMic.xml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,25 @@
</module>

<module>
<name>voiceActivationDetection</name>
<name>yarprobotinterface</name>
<parameters>--context headSynchronizer --from faceDisplay.ini</parameters>
<node>r1-face</node>
</module>

<module>
<name>yarprobotinterface</name>
<parameters>--context vadModule --from audioPlayer.ini</parameters>
<node>r1-face</node>
</module>

<module>
<name>faceExpressionImage5GTour</name>
<parameters></parameters>
<node>console</node>
</module>

<module>
<name>sileroVAD</name>
<parameters></parameters>
<environment>YARP_LOG_PROCESS_LABEL=VAD</environment>
<node>console</node>
Expand Down Expand Up @@ -69,4 +87,22 @@
<protocol>tcp+recv.portmonitor+file.soundfilter_resample+type.dll+channel.0+frequency.16000</protocol>
</connection>

<connection>
<from>/faceExpressionImage/image:o</from>
<to>/robot/faceDisplay/image:i</to>
<protocol>fast_tcp</protocol>
</connection>

<connection>
<from>/notification:i</from>
<to>/audioPlayerWrapper/audio:i</to>
<protocol>tcp+recv.portmonitor+file.soundfilter_resample+type.dll+channel.0+frequency.16000</protocol>
</connection>

<connection>
<from>/wake/face:o</from>
<to>/faceExpressionImage/rpc</to>
<protocol>fast_tcp</protocol>
</connection>

</application>
38 changes: 3 additions & 35 deletions aux_modules/faceExpression/earsThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ bool EarsThread::threadInit()
m_blackBar.setTo(Scalar(0, 0, 0));

this->resetToDefault();
m_earBar.setTo(Scalar(0,0,255));
return true;
}

Expand All @@ -89,33 +90,12 @@ void EarsThread::run()
return;
}

float percentage = 0.5;

yarp::dev::AudioRecorderStatus *rec_status = m_audioStatusPort.read(false);
if (rec_status)
{
m_micIsEnabled = rec_status->enabled; //&& rec_status->current_buffer_size > 0;
//yInfo() << rec_status->current_buffer_size;
// yInfo() << m_micIsEnabled;
}

yarp::sig::Sound* data_audio = m_audioRecPort.read(false);
if(m_doBars)
{
if(data_audio){
auto vec= data_audio->getChannel(0);
short int max_val = *std::max_element(vec.begin(),vec.end());
// max_val = 30000;
percentage = fabs((float)max_val / 32800);
updateBars(percentage);
yInfo() << percentage;
}
updateBars(1);
}
else
{
updateBars(0.5);
}

}

bool EarsThread::updateBars(float percentage)
Expand All @@ -125,18 +105,6 @@ bool EarsThread::updateBars(float percentage)
earBar0_len = earBar0_minLen + (earBar0_maxLen - earBar0_minLen) * percentage;
earBar1_len = earBar1_minLen + (earBar1_maxLen - earBar1_minLen) * percentage;

if(m_micIsEnabled==false){
m_earBar.setTo(Scalar(0,0,255));
percentage=0.5;
}
else
{
m_earBar.setTo(m_earsDefaultColor);
}
if(percentage>0.85){
m_earBar.setTo(Scalar(255,0,0));
}


// Reset bars to black
m_blackBar(Rect(0, 0, barWidth, 32)).copyTo (m_face(cv::Rect(earBarL0_x, 0, barWidth, 32)));
Expand Down Expand Up @@ -165,7 +133,7 @@ void EarsThread::resetToDefault()
lock_guard<recursive_mutex> lg(m_methods_mutex);
m_doBars = true;
m_earBar.setTo(m_earsDefaultColor);
updateBars(0.5);
updateBars(1);
}

void EarsThread::setColor(float vr, float vg, float vb)
Expand Down
2 changes: 2 additions & 0 deletions aux_modules/speechProcessing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ add_subdirectory(messages)
if(WAKE_WORD)
add_subdirectory(wakeWordDetection)
endif()
add_subdirectory(sileroVAD)

find_package(PkgConfig REQUIRED)
pkg_check_modules(libfvad IMPORTED_TARGET libfvad)
if(${libfvad_FOUND})
Expand Down
39 changes: 39 additions & 0 deletions aux_modules/speechProcessing/sileroVAD/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
################################################################################
# #
# Copyright (C) 2020 Fondazione Istituto Italiano di Tecnologia (IIT) #
# All Rights Reserved. #
# #
################################################################################
project(sileroVAD)
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
set(AUX_NAME sileroVAD)
add_executable(${AUX_NAME})
target_include_directories(${AUX_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_CURRENT_BINARY_DIR}
$<TARGET_PROPERTY:WakeMsgs,INTERFACE_INCLUDE_DIRECTORIES>
/usr/local/src/robot/onnxruntime-linux-x64-1.20.0/include
)
target_sources(${AUX_NAME}
PRIVATE
main.cpp
Detector.h
Detector.cpp
VoiceActivationDetectionModule.h
VoiceActivationDetectionModule.cpp
)


target_sources(${AUX_NAME}
PRIVATE
${STATECHARTS_FILES})

target_link_libraries(${AUX_NAME}
PRIVATE
${YARP_LIBRARIES}
WakeMsgs
/usr/local/src/robot/onnxruntime-linux-x64-1.20.0/lib/libonnxruntime.so)


install(TARGETS ${AUX_NAME} DESTINATION bin)
191 changes: 191 additions & 0 deletions aux_modules/speechProcessing/sileroVAD/Detector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// SPDX-FileCopyrightText: 2022 Humanoid Sensing and Perception, Istituto Italiano di Tecnologia
// SPDX-License-Identifier: BSD-3-Clause
#ifdef _WIN32
#include <Windows.h>
#else
#include <unistd.h>
#endif
#include "Detector.h"

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <stdexcept>

YARP_LOG_COMPONENT(VADAUDIOPROCESSOR, "behavior_tour_robot.voiceActivationDetection.AudioProcessor", yarp::os::Log::TraceType)

// Voice-activity detector built on an ONNX VAD model (Silero, per the module
// name). Buffers incoming audio into fixed-size windows, runs inference per
// window, and after speech followed by a long-enough silence gap forwards the
// recorded audio on the filtered-audio port and signals stop over the RPC
// client port.
//
// @param vadFrequency            audio sample rate; only 16000 or 8000 supported
// @param gapAllowance            silent windows tolerated before speech is
//                                considered finished
// @param saveGap                 when true, gap (silent) windows are kept in
//                                the outgoing sound
// @param threshold               speech probability above which a window
//                                counts as voice
// @param vadSavePriorToDetection number of pre-roll windows kept before the
//                                first detection (0 disables)
// @param modelPath               path of the ONNX model file
// @param filteredAudioPortOutName name for the filtered-audio output port
// @param wakeWordClientPort      name for the RPC client port
// @throws std::runtime_error when vadFrequency is neither 16000 nor 8000
Detector::Detector(int vadFrequency,
                   int gapAllowance,
                   bool saveGap,
                   float threshold,
                   int vadSavePriorToDetection,
                   const std::string modelPath,
                   std::string filteredAudioPortOutName,
                   std::string wakeWordClientPort):
    m_vadFrequency(vadFrequency),
    m_vadGapAllowance(gapAllowance),
    m_vadSaveGap(saveGap),
    m_vadThreshold(threshold),
    m_vadSavePriorToDetection(vadSavePriorToDetection),
    // The model accepts exactly 512 samples per window at 16 kHz, 256 at 8 kHz.
    m_vadNumSamples((vadFrequency == 16000) ? 512 :
                    (vadFrequency == 8000) ? 256 :
                    throw std::runtime_error("Unsupported sample rate")),
    // Tail of the previous window prepended to each model input (see predict()).
    m_context((vadFrequency == 16000) ? 64 : 32, 0),
    m_currentSoundBufferNorm(m_vadNumSamples, 0),
    m_currentSoundBuffer(m_vadNumSamples, 0),
    m_fillCount(0) {

    init_onnx_model(modelPath);

    // Model input tensor is [1, context + window].
    m_input.resize(m_context.size() + m_vadNumSamples);
    m_input_node_dims[0] = 1;
    m_input_node_dims[1] = m_context.size() + m_currentSoundBuffer.size();

    m_state.resize(m_size_state);
    m_sr.resize(1);
    m_sr[0] = m_vadFrequency;

    reset_states();

    // Port-open failures are logged but not fatal (original behavior kept).
    if (!m_rpcClientPort.open(wakeWordClientPort)){
        yCError(VADAUDIOPROCESSOR) << "cannot open port" << wakeWordClientPort;
    }
    m_rpcClient.yarp().attachAsClient(m_rpcClientPort);

    if (!m_filteredAudioOutputPort.open(filteredAudioPortOutName)){
        // Bug fix: this message previously printed wakeWordClientPort,
        // hiding which port actually failed to open.
        yCError(VADAUDIOPROCESSOR) << "cannot open port" << filteredAudioPortOutName;
    }
}

// Configure the ONNX Runtime session: thread counts for inter-/intra-op
// parallelism, with all graph optimizations disabled.
void Detector::init_engine_threads(int inter_threads, int intra_threads) {
    m_session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_DISABLE_ALL);
    m_session_options.SetInterOpNumThreads(inter_threads);
    m_session_options.SetIntraOpNumThreads(intra_threads);
}

// Create the inference session for the VAD model stored at model_path.
void Detector::init_onnx_model(const std::string& model_path) {
    // One thread each way is enough for this small model.
    init_engine_threads(1, 1);
    m_session = std::make_shared<Ort::Session>(m_env, model_path.c_str(),
                                               m_session_options);
}

// Zero the model's recurrent state (called at construction and after each
// completed utterance so a new detection does not inherit old state).
void Detector::reset_states() {
    // The original used std::memset(ptr, 0.0f, bytes): the float byte-value
    // argument only worked via silent float->int conversion, and <cstring>
    // was never included. std::fill states the intent directly.
    std::fill(m_state.begin(), m_state.end(), 0.0f);
}

// Run one VAD inference window and drive the speech-segmentation logic.
//
// @param data current audio window normalized to [-1, 1] (filled by onRead;
//             same length as m_currentSoundBuffer, which holds the raw
//             samples that get buffered for sending).
void Detector::predict(const std::vector<float> &data) {
    // Model input is [previous context | normalized window].
    // NOTE(review): the original ignored `data` entirely and copied the raw
    // (un-normalized) m_currentSoundBuffer into the model input and context;
    // Silero VAD expects float audio in [-1, 1], so the normalized window is
    // used here — confirm against the model's input spec.
    std::copy(m_context.begin(), m_context.end(), m_input.begin());
    std::copy(data.begin(), data.end(), m_input.begin() + m_context.size());

    Ort::Value input_ort = Ort::Value::CreateTensor<float>(
        m_memory_info, m_input.data(), m_input.size(), m_input_node_dims, 2);
    Ort::Value state_ort = Ort::Value::CreateTensor<float>(
        m_memory_info, m_state.data(), m_state.size(), m_state_node_dims, 3);
    Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
        m_memory_info, m_sr.data(), m_sr.size(), m_sr_node_dims, 1);

    m_ort_inputs.clear();
    m_ort_inputs.emplace_back(std::move(input_ort));
    m_ort_inputs.emplace_back(std::move(state_ort));
    m_ort_inputs.emplace_back(std::move(sr_ort));

    // Infer: output 0 is the speech probability, output 1 the recurrent
    // state to feed back on the next call.
    m_ort_outputs = m_session->Run(
        Ort::RunOptions{nullptr},
        m_input_node_names.data(), m_ort_inputs.data(), m_ort_inputs.size(),
        m_output_node_names.data(), m_output_node_names.size());

    float speech_prob = m_ort_outputs[0].GetTensorMutableData<float>()[0];
    float *stateN = m_ort_outputs[1].GetTensorMutableData<float>();
    std::memcpy(m_state.data(), stateN, m_size_state * sizeof(float));

    bool isTalking = speech_prob > m_vadThreshold;
    if (isTalking) {
        // Speech: buffer the raw window and reset the silence-gap counter.
        yCDebug(VADAUDIOPROCESSOR) << "Voice detected adding to send buffer";
        m_soundDetected = true;
        m_soundToSend.push_back(m_currentSoundBuffer);
        m_gapCounter = 0;
    } else if (m_soundDetected) {
        // Silence after speech: either the utterance is over, or we are
        // bridging a short gap (optionally keeping the gap audio).
        ++m_gapCounter;
        if (m_gapCounter > m_vadGapAllowance) {
            yCDebug(VADAUDIOPROCESSOR) << "End of speech"; // typo "of of" fixed
            sendSound();
            m_soundToSend.clear();
            m_soundDetected = false;
            m_rpcClient.stop();
            reset_states();
        } else if (m_vadSaveGap) {
            m_soundToSend.push_back(m_currentSoundBuffer);
        }
    } else if (m_vadSavePriorToDetection > 0) {
        // No speech yet: keep a rolling pre-roll of the most recent windows.
        m_soundToSend.push_back(m_currentSoundBuffer);
        if (m_soundToSend.size() > static_cast<size_t>(m_vadSavePriorToDetection)) {
            m_soundToSend.pop_front();
        }
    }

    // Keep the tail of the normalized window as context for the next call.
    std::copy(data.end() - m_context.size(), data.end(), m_context.begin());
}


// Port callback: accumulate incoming samples into the current window (both
// raw and normalized-to-[-1,1] copies) and run the detector each time a
// full window has been filled.
void Detector::onRead(yarp::sig::Sound& soundReceived) {
    const size_t sampleCount = soundReceived.getSamples();

    for (size_t idx = 0; idx < sampleCount; ++idx) {
        const auto rawSample = soundReceived.get(idx);
        m_currentSoundBuffer.at(m_fillCount) = rawSample;
        m_currentSoundBufferNorm.at(m_fillCount) =
            static_cast<float>(rawSample) / INT16_MAX;

        if (++m_fillCount == m_currentSoundBuffer.size()) {
            // Window complete: classify it and start filling the next one.
            predict(m_currentSoundBufferNorm);
            m_fillCount = 0;
        }
    }
}


// Flush the buffered speech windows to the filtered-audio output port,
// zero-padding up to m_minSoundSize packets so downstream consumers always
// receive a minimum-length sound.
void Detector::sendSound() {
    const size_t packetsWithSound = m_soundToSend.size();
    const size_t packetSize = m_currentSoundBuffer.size();

    yarp::sig::Sound& soundToSend = m_filteredAudioOutputPort.prepare();
    yCDebug(VADAUDIOPROCESSOR) << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> sending recorded voice sound";

    // size_t throughout fixes the original's signed/unsigned comparisons
    // between int packet/sample counts and size_t loop indices.
    const size_t minPackets = static_cast<size_t>(m_minSoundSize);
    const size_t totalPackets = packetsWithSound < minPackets ? minPackets
                                                              : packetsWithSound;
    const size_t numSamples = packetSize * totalPackets;
    soundToSend.resize(numSamples);
    soundToSend.setFrequency(m_vadFrequency);

    // Copy every recorded window into the outgoing sound.
    for (size_t p = 0; p < packetsWithSound; ++p) {
        for (size_t i = 0; i < packetSize; ++i) {
            soundToSend.set(m_soundToSend[p].at(i), i + (p * packetSize));
        }
    }

    // Zero-pad the remainder up to the minimum size.
    for (size_t i = packetsWithSound * packetSize; i < numSamples; ++i) {
        soundToSend.set(0, i);
    }

    m_filteredAudioOutputPort.write();
}

Loading