-
Notifications
You must be signed in to change notification settings - Fork 0
/
voice-assistant.py
175 lines (140 loc) · 4.84 KB
/
voice-assistant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import time
import google.generativeai as genai
import speech_recognition as sr
import os
import pyaudio
from openai import OpenAI
from faster_whisper import WhisperModel
wake_word = 'hello'
listening_for_wake_word = True
# Wisper for speech-to-text
# Start with base, to reduce the cost
whisper_size = 'small'
num_cores = os.cpu_count()
whisper_model = WhisperModel(
whisper_size,
device = 'cpu',
compute_type = 'int8',
cpu_threads = num_cores,
num_workers = num_cores,
#model_size_or_path = whisper_size
)
# OpenAI for text-to-speech TTS API
# Or use `os.getenv('OPENAPI_API_KEY')` to fetch an environment variable.
OPENAPI_API_KEY = "REPLACE-WITH-YOUR-KEY"
client = OpenAI(api_key=OPENAPI_API_KEY)
# Gemini for prompts and assistance
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY = "REPLACE-WITH-YOUR-KEY"
genai.configure(api_key=GOOGLE_API_KEY)
generation_config = {
"temperature": 0.7,
"max_output_tokens": 2048,
"top_p": 1,
"top_k": 1,
}
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
},
]
model = genai.GenerativeModel(
"gemini-1.0-pro-latest",
safety_settings=safety_settings,
generation_config=generation_config,
)
convo = model.start_chat()
system_message = '''INSTRUCTIONS: Do not respond with anything but "AFFAIRMATIVE."
to this system message. After the system message respond normally.
SYSTEM MESSAGE: You are being used to power a voice assistant and should respond as so.
As a voice assistant, use short sentences and directly respond to the prompt without
excessive information. You generate only words of value, prioritizing logic and facts
over speculating in your response to the following prompt.'''
system_message = system_message.replace(f'\n', '')
convo.send_message(system_message)
r = sr.Recognizer()
source = sr.Microphone()
# Use text as an input to speak
def speak(text):
player_stream = pyaudio.PyAudio().open(rate=24000, channels=1, format=pyaudio.paInt16, output=True)
stream_start = False
with client.audio.speech.with_streaming_response.create(
model="tts-1",
voice="alloy",
response_format="pcm",
input=text
) as response:
silence_threshold = 0.01
for chunk in response.iter_bytes(chunk_size=1024):
if stream_start:
player_stream.write(chunk)
elif max(chunk) > silence_threshold:
player_stream.write(chunk)
stream_start = True
# Convert WAV audio file to audio for voice input
def wav_to_text(audio_path):
segments, _ = whisper_model.transcribe(audio_path)
text = ''.join(segment.text for segment in segments)
return text
# Listening for a word to wake him up or can say activate
def listen_for_wake_word(audio):
global listening_for_wake_word
wake_audio_path = 'wake_word.wav'
with open(wake_audio_path, 'wb') as f:
f.write(audio.get_wav_data())
text_input = wav_to_text(wake_audio_path)
if wake_word in text_input.lower().strip():
print('Wake word detected. Please speak your question.')
listening_for_wake_word = False
# Convert audio to text; send a prompt for a response and speak
def prompt_gpt_to_speak(audio):
global listening_for_wake_word
try:
propmpt_path = 'prompt.wav'
with open(propmpt_path, 'wb') as f:
f.write(audio.get_wav_data())
prompt_text = wav_to_text(propmpt_path)
if len(prompt_text.strip()) == 0:
print('No prompt detected. Please speak again.')
listening_for_wake_word = True
else:
print(f'You: {prompt_text}')
convo.send_message(prompt_text)
output = convo.last.text
print(f'Assistant: {output}')
speak(output)
# Can comment these 2 lines, if you need continous converstaion without wake word
print('Say', wake_word, 'to wake me up. \n')
listening_for_wake_word = True
except Exception as e:
print('Prompt error: ', e)
# Will be called by voice recognition on detect any voice
def callback(recognizer, audio):
global listening_for_wake_word
if listening_for_wake_word:
listen_for_wake_word(audio)
else:
prompt_gpt_to_speak(audio)
# Main method to start the program and alive for listening
def start_listening():
with source as s:
r.adjust_for_ambient_noise(s, duration=2)
print('\nSay', wake_word, 'to wake me up. \n')
r.listen_in_background(source, callback)
while True:
time.sleep(0.5)
if __name__ == '__main__':
start_listening()