import openai
import pyttsx3 as tts
from gtts import gTTS
import torch
# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
import speech_recognition as sr
from pygame import mixer
import time
from py_error_handler import noalsaerr # to suppress ALSA lib dumps
import os
import requests
import random
import json
from dotenv import load_dotenv
# load environment variables from .env file
load_dotenv()
# get OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
# get eleven labs API key
elevenlabs_api_key = os.environ['ELEVENLABS_API_KEY']
# get FastSpeech2 API token from Hugging Face
hf_api_token = os.environ['HF_API_TOKEN']
# set to True to play a sound during idle sessions, or False to disable it
activate_idle_state_sound = True
# configure system response context and keep track of conversation thread
msg_thread = [
    {"role": "system", "content": "You are a very knowledgeable assistant. Explain using first principles like the physicist Richard Feynman."},
]
# initialize module for loading and playing sounds
mixer.init()
# text-to-speech engine instance
tts_engine = tts.init() # TODO add a much smoother speech synthesizer
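
# TODO candidate for the smoother synthesizer: local SpeechT5 via the disabled
# transformers import above. A minimal sketch, assuming `transformers`, `datasets`
# and `soundfile` are installed (the x-vector speaker-embedding source follows the
# Hugging Face SpeechT5 example); kept commented out like the import it depends on:
#   from datasets import load_dataset
#   import soundfile as sf
#   processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
#   model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
#   vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
#   inputs = processor(text="Hello there.", return_tensors="pt")
#   xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#   speaker_embeddings = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)
#   speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
#   sf.write("speecht5_out.wav", speech.numpy(), samplerate=16000)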

# functions for transcribing sound from a file to text
def sr_transcribe(file_name):
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_name) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data)
    except (sr.UnknownValueError, sr.RequestError):
        print("Ah, let's give this another try.")
        return None

def whisper_transcribe(file_name):
    # use a context manager so the audio file handle is closed after the API call
    with open(file_name, "rb") as audio_data:
        transcript = openai.Audio.transcribe("whisper-1", audio_data)
    return transcript["text"]

# functions using the OpenAI API to generate a response to a user query
def get_prompt_response(prompt):
    # note: prompt tokens + max_tokens must fit within text-davinci-003's 4097-token context
    response = openai.Completion.create(engine="text-davinci-003",
                                        prompt=prompt,
                                        max_tokens=4000,
                                        n=1,
                                        stop=None,
                                        temperature=0.5)
    return response["choices"][0]["text"]

def get_chat_response(messages):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    return response["choices"][0]["message"]["content"]
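
# a quick illustration of the thread format get_chat_response expects
# (hypothetical sample question, shown as a comment only):
#   msg_thread.append({"role": "user", "content": "What is entropy?"})
#   print(get_chat_response(msg_thread))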

# builds an updated, human-readable transcript for each session
def get_transcript(msg_thread):
    # format the message thread, skipping the system prompt
    chat_transcript = ""
    for message in msg_thread:
        if message["role"] != "system":
            chat_transcript += message["role"] + \
                ": " + message["content"] + "\n\n"
    return chat_transcript

# functions for text-to-speech synthesis
def pyttsx3_tts(text):
    tts_engine.say(text)
    tts_engine.runAndWait()

# TODO issues with incomplete and slow text read out
def gTranslate_tts(text):
    # local name avoids shadowing the pyttsx3 module alias `tts`
    gtts_engine = gTTS(text, lang="en", tld="co.uk")
    gtts_engine.save("out.mp3")
    play_sound("out.mp3")  # alternative: os.system("mpg123 out.mp3")

def elevenlabs_tts(text):
    # get the list of Eleven Labs voices
    response = requests.get('https://api.elevenlabs.io/v1/voices',
                            headers={
                                'accept': 'application/json',
                                'xi-api-key': elevenlabs_api_key,
                            })
    resp_jdata = response.json()
    # randomly pick one of the premade Eleven Labs voices
    # (randrange's stop is exclusive, so len() alone already includes the last index)
    v_id = random.randrange(len(resp_jdata["voices"]))
    voice_id = resp_jdata["voices"][v_id]["voice_id"]
    # configure the voice and feed text to the remote Eleven Labs speech synthesis engine
    response = requests.post(
        'https://api.elevenlabs.io/v1/text-to-speech/{}'.format(voice_id),
        headers={
            'accept': 'audio/mpeg',
            'xi-api-key': elevenlabs_api_key,
            'Content-Type': 'application/json',
        },
        json={'text': text},
    )
    # write the audio to a file and play it back
    with open('out.mp3', 'wb') as f:
        f.write(response.content)
    play_sound("out.mp3")  # alternative: os.system("mpg123 out.mp3")

def hf_tts_query(payload, model_id, api_token):
    # feed text to a remote speech synthesis model via the Hugging Face Inference API
    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {
        "Authorization": f"Bearer {api_token}",
        "accept": "audio/mpeg",
        "Content-Type": "application/json",
    }
    response = requests.post(API_URL, headers=headers, json={"inputs": payload})
    # the API returns raw audio bytes on success and a JSON error body on failure
    if response.headers.get("Content-Type", "").startswith("audio"):
        # TODO verify the audio container; the returned bytes may not be MP3-encoded
        with open('out.mp3', 'wb') as f:
            f.write(response.content)
        os.system("mpg123 out.mp3")
        return None
    return response.json()  # surface the error details to the caller

# binds all the TTS functions above into one call
def do_tts(tts_engine_name: str, text: str):
    '''
    Multi text-to-speech synthesis utility.

    tts_engine_name - any of 'pyttsx3', 'gtts', 'elevenlabs', 'fastspeech2' and 'speecht5'.
    text - input text string to be converted to speech.
    '''
    if tts_engine_name == "pyttsx3":
        pyttsx3_tts(text)
    elif tts_engine_name == "gtts":
        gTranslate_tts(text)
    elif tts_engine_name == "elevenlabs":
        elevenlabs_tts(text)
    elif tts_engine_name == "fastspeech2":
        tts_data = hf_tts_query(text, "facebook/fastspeech2-en-ljspeech", hf_api_token)
        print(f"fastspeech2 tts_data: {tts_data}")
    elif tts_engine_name == "speecht5":
        # TODO fix error: {'error': 'text-to-speech is not a valid pipeline'}
        tts_data = hf_tts_query(text, "microsoft/speecht5_tts", hf_api_token)
        print(f"speecht5 tts_data: {tts_data}")
    else:
        gTranslate_tts(text)  # default engine
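
# example usage of the dispatcher (hypothetical sample text):
#   do_tts("gtts", "Hello, how can I help?")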

# functions for handling sound file playback
def play_sound(sound_file):
    # sound_file - path to the sound file as a string
    # mixer.init() was already called at module load
    mixer.music.load(sound_file)
    mixer.music.play()

def stop_sound_gracefully():
    mixer.music.fadeout(2000)  # fadeout takes milliseconds; fade out over 2 seconds

# main
def main(gpt_version):
    # gpt_version - takes valid values 'gpt-3' and 'gpt-3.5'
    global msg_thread
    no_activity_period_count = 0  # for tracking idle convo sessions
    while True:
        # TODO find and fix the cause of the ALSA dumps
        with noalsaerr():  # suppress ALSA lib dumps for now; remove when the dump issues are fixed
            recognizer = sr.Recognizer()
            # initial prompt ('KAI' in this case) to begin recording
            print("Say 'KAI' to record your question...")
            with sr.Microphone() as source:
                recognizer.pause_threshold = 1  # pause_threshold lives on the Recognizer, not the source
                recognizer.adjust_for_ambient_noise(source)
                audio = recognizer.listen(source)
            try:
                transcription = recognizer.recognize_google(audio)
                # stop the sound triggered by possible previous inactivity
                if no_activity_period_count > 0:
                    if activate_idle_state_sound:
                        stop_sound_gracefully()
                # if "kai" in transcription.lower():
                print("Yes. How may I be of help?")
                pyttsx3_tts("Yes. How may I be of help?")
                # record audio
                in_audio_filename = "input.wav"
                with sr.Microphone() as source:
                    recognizer.pause_threshold = 1
                    recognizer.adjust_for_ambient_noise(source)
                    audio = recognizer.listen(source,
                                              phrase_time_limit=None,
                                              timeout=None)
                with open(in_audio_filename, "wb") as f:
                    f.write(audio.get_wav_data())
                # transcribe audio to text
                # text = sr_transcribe(in_audio_filename)
                transcript = whisper_transcribe(in_audio_filename)
                if transcript:
                    # add the transcribed query to the conversation thread
                    msg_thread.append({"role": "user", "content": transcript})
                    print("You asked: '{}'.".format(transcript))
                    pyttsx3_tts("You asked, '{}'.".format(transcript))
                    # generate response
                    if gpt_version == "gpt-3":
                        response = get_prompt_response(transcript)
                        print("GPT response: {}".format(response))
                    else:  # default to the gpt-3.5-turbo chat model
                        response = get_chat_response(msg_thread)
                    # add the assistant response to the thread
                    msg_thread.append({"role": "assistant", "content": response})
                    # display the message thread
                    print("Messages: \n\n{}".format(get_transcript(msg_thread)))
                    # read out the response
                    pyttsx3_tts(response)
                    pyttsx3_tts("I hope that helped. You can ask your next question.")
                    time.sleep(2)
                # reset counter
                no_activity_period_count = 0
            except sr.RequestError:
                print("API unavailable")
                pyttsx3_tts("It would appear we have a problem with access to connectivity.")
            except sr.UnknownValueError:
                print("Unable to recognize speech")
                # pyttsx3_tts("I could not hear you.")
                # play a brief sound file during the inactive period
                if activate_idle_state_sound:
                    play_sound("ambient-music-0036.mp3")
                # keep track of inactivity
                no_activity_period_count += 1
                # ends the session after 5 loops with no voice prompts from the user
                if no_activity_period_count > 5:
                    time.sleep(2)
                    pyttsx3_tts("Bye for now.")
                    break

# entry point
if __name__ == "__main__":
    main("gpt-3.5")  # pass "gpt-3" to use the text-davinci-003 completion model instead