# -*- encoding: utf-8 -*-
import time
import traceback
import platform
if platform.system() == 'Windows':
import pyaudiowpatch as pyaudio
else:
import pyaudio
import numpy as np
import torch
import audio_tools
import keyboard
import threading
from pathlib import Path
from whisper import audio as whisper_audio
import wave
import Utilities
import Models.STS.SpeakerDiarization as speaker_diarization
SAMPLE_RATE = whisper_audio.SAMPLE_RATE
CHUNK = int(SAMPLE_RATE / 10)  # 100 ms of audio per frame
CHANNELS = 1
MAIN_APP_BEFORE_CALLBACK_FUNC_LISTS = {
'before_callback_called_func': [], # called on every audio frame
'before_recording_send_to_queue_callback_func': [], # called before final audio frame is sent to queue for AI to process
'before_recording_starts_callback_func': [], # called when recording starts
'before_recording_running_callback_func': [], # called when recording is running
}
def _main_app_before_callback_called_for_list(func_call_name: str, callback_obj=None):
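    """Call every registered hook in the named callback list, passing it the
    callback object (typically an AudioProcessor instance)."""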
if func_call_name not in MAIN_APP_BEFORE_CALLBACK_FUNC_LISTS:
return
for func in MAIN_APP_BEFORE_CALLBACK_FUNC_LISTS[func_call_name]:
if func is not None and callable(func):
func(callback_obj)
return
def main_app_before_callback_called(callback_obj):
_main_app_before_callback_called_for_list('before_callback_called_func', callback_obj)
def main_app_before_recording_send_to_queue_callback(callback_obj):
_main_app_before_callback_called_for_list('before_recording_send_to_queue_callback_func', callback_obj)
def main_app_before_recording_starts_callback(callback_obj):
_main_app_before_callback_called_for_list('before_recording_starts_callback_func', callback_obj)
def main_app_before_recording_running_callback(callback_obj):
_main_app_before_callback_called_for_list('before_recording_running_callback_func', callback_obj)
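# Hooks can be registered by appending a callable to one of the lists above.
# A minimal sketch ('my_hook' is a hypothetical function, not part of this module):
#
# def my_hook(audio_processor):
#     print("recording is about to start")
#
# MAIN_APP_BEFORE_CALLBACK_FUNC_LISTS['before_recording_starts_callback_func'].append(my_hook)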
# Provided by Alexander Veysov
def int2float(sound):
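    """Convert int16 PCM samples to float32, scaled by the peak absolute value
    so the result lies within [-1.0, 1.0]."""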
abs_max = np.abs(sound).max()
sound = sound.astype('float32')
if abs_max > 0:
sound *= 1 / abs_max
    sound = sound.squeeze()  # collapse to 1-D; whether to squeeze depends on the use case
return sound
def save_to_wav(data, filename, sample_rate, channels=1):
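    """Write raw 16-bit PCM data to a WAV file."""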
with wave.open(filename, 'wb') as wf:
wf.setnchannels(channels)
wf.setsampwidth(2) # Assuming 16-bit audio
wf.setframerate(sample_rate)
wf.writeframes(data)
def call_plugin_sts(plugins, wavefiledata, sample_rate):
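    """Call the sts method of every enabled plugin with the recorded audio."""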
for plugin_inst in plugins:
if plugin_inst.is_enabled(False) and hasattr(plugin_inst, 'sts'):
try:
plugin_inst.sts(wavefiledata, sample_rate)
except Exception as e:
print("Error in plugin sts method: " + str(e))
def process_audio_chunk(audio_chunk, sample_rate, vad_model=None):
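    """Compute the VAD confidence and the peak amplitude of a raw int16 audio chunk.

    If no VAD model is given, a sentinel confidence of 9.9 is returned so that any
    confidence threshold check passes.
    """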
audio_int16 = np.frombuffer(audio_chunk, np.int16)
audio_float32 = int2float(audio_int16)
    if vad_model is not None:
        new_confidence = vad_model.run_vad(torch.from_numpy(audio_float32), sample_rate).item()
    else:
        # no VAD model available: use a sentinel confidence that always passes the threshold check
        new_confidence = 9.9
    peak_amplitude = np.max(np.abs(audio_int16))
    # release the intermediate buffers explicitly
    del audio_int16, audio_float32
return new_confidence, peak_amplitude
class AudioProcessor:
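    """Processes raw audio frames delivered by a PyAudio stream callback:
    runs voice activity detection, optional denoising, loudness normalization,
    silence cutting and speaker diarization, and puts finished (and intermediate
    realtime) audio clips on the audio queue."""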
last_callback_time = time.time()
def __init__(self,
default_sample_rate=SAMPLE_RATE,
start_rec_on_volume_threshold=None,
push_to_talk_key=None,
keyboard_rec_force_stop=None,
vad_model=None,
needs_sample_rate_conversion=False,
recorded_sample_rate=None,
input_channel_num=2,
plugins=None,
audio_enhancer=None,
osc_ip=None,
osc_port=None,
chunk=None,
channels=None,
sample_format=None,
audio_queue=None,
settings=None,
typing_indicator_function=None,
before_callback_called_func=None,
before_recording_send_to_queue_callback_func=None,
before_recording_starts_callback_func=None,
before_recording_running_callback_func=None,
before_recording_stopped_callback_func=None,
verbose=False
):
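        """Store the recording configuration; heavier collaborators (VAD model,
        audio enhancer, plugins, settings, queue) are injected by the caller."""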
self.frames = []
self.default_sample_rate = default_sample_rate
self.previous_audio_chunk = None
self.start_rec_on_volume_threshold = start_rec_on_volume_threshold
self.push_to_talk_key = push_to_talk_key
self.keyboard_rec_force_stop = keyboard_rec_force_stop
self.vad_model = vad_model
self.needs_sample_rate_conversion = needs_sample_rate_conversion
self.recorded_sample_rate = recorded_sample_rate
self.input_channel_num = input_channel_num
self.plugins = plugins
self.audio_enhancer = audio_enhancer
self.osc_ip = osc_ip
self.osc_port = osc_port
self.verbose = verbose
self.start_time = time.time()
self.pause_time = time.time()
self.intermediate_time_start = time.time()
        self.block_size_samples = int(
            self.default_sample_rate * 0.400)  # block size in samples (0.400 s is pyloudnorm's default block size)
self.chunk = chunk
self.channels = channels
self.sample_format = sample_format
self.audio_queue = audio_queue
self.settings = settings
self.diarization_model = None
self.typing_indicator_function = typing_indicator_function
self._new_speaker = False
self.new_speaker_audio = None
self.before_callback_called_func = before_callback_called_func
self.before_recording_send_to_queue_callback_func = before_recording_send_to_queue_callback_func
self.before_recording_starts_callback_func = before_recording_starts_callback_func
self.before_recording_running_callback_func = before_recording_running_callback_func
self.audio_filter_buffer = None
        # run the callback after a timeout even if no audio was detected (and the callback was therefore not invoked by PyAudio)
# self.timer_reset_event = threading.Event()
# self.timer_thread = threading.Thread(target=self.timer_expired)
# self.timer_thread.start()
# self.timer_reset_event.set()
# self.last_callback_time = time.time()
    def should_start_recording(self, peak_amplitude, energy, new_confidence, confidence_threshold, keyboard_key=None):
        """True if the push-to-talk key is held, or if the volume reached the energy
        threshold and the VAD confidence reached the confidence threshold."""
        if keyboard_key is not None and keyboard.is_pressed(keyboard_key):
            return True
        return 0 < energy <= peak_amplitude and new_confidence >= confidence_threshold

    def should_stop_recording(self, new_confidence, confidence_threshold, peak_amplitude, energy, pause_time, pause,
                              keyboard_key=None):
        """True if the push-to-talk key was released, or if the volume dropped below the
        energy threshold, the VAD confidence dropped below the confidence threshold (or
        the threshold is disabled), and the pause timeout elapsed."""
        if keyboard_key is not None and not keyboard.is_pressed(keyboard_key):
            return True
        return (0 < energy and peak_amplitude < energy
                and (new_confidence < confidence_threshold or confidence_threshold == 0.0)
                and time.time() - pause_time > pause > 0.0)
# The function to call when the timer expires
# def timer_expired(self):
# while True:
# current_time = time.time()
# time_since_last_callback = current_time - self.last_callback_time
# if self.recorded_sample_rate is not None:
# # wait double the chunk size to not accidentally call callback twice
# self.timer_reset_event.wait(timeout=(self.chunk / self.recorded_sample_rate)*2)
# if time_since_last_callback >= (self.chunk / self.recorded_sample_rate)*2 and len(self.frames) > 0:
# #print("Timer expired. Triggering callback.")
# try:
# print("Timer expired. Triggering callback.")
# self.callback(None, None, None, None)
# except Exception as e:
# print(e)
# self.timer_reset_event.clear()
def callback(self, in_data, frame_count, time_info, status):
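        """PyAudio stream callback, invoked once per captured audio frame.
        Returns an (audio data, flag) tuple as required by the PyAudio callback API."""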
# Reset the timer each time the callback is triggered
# self.last_callback_time = time.time()
# self.timer_reset_event.set()
if self.before_callback_called_func is not None and callable(self.before_callback_called_func):
self.before_callback_called_func(self)
if not self.settings.GetOption("stt_enabled"):
return None, pyaudio.paContinue
# disable gradient calculation
with torch.no_grad():
phrase_time_limit = self.settings.GetOption("phrase_time_limit")
pause = self.settings.GetOption("pause")
energy = self.settings.GetOption("energy")
if phrase_time_limit == 0:
phrase_time_limit = None
silence_cutting_enabled = self.settings.GetOption("silence_cutting_enabled")
silence_offset = self.settings.GetOption("silence_offset")
max_silence_length = self.settings.GetOption("max_silence_length")
keep_silence_length = self.settings.GetOption("keep_silence_length")
normalize_enabled = self.settings.GetOption("normalize_enabled")
normalize_lower_threshold = self.settings.GetOption("normalize_lower_threshold")
normalize_upper_threshold = self.settings.GetOption("normalize_upper_threshold")
normalize_gain_factor = self.settings.GetOption("normalize_gain_factor")
use_speaker_diarization = self.settings.GetOption("speaker_diarization")
if use_speaker_diarization and self.diarization_model is None:
self.diarization_model = speaker_diarization.SpeakerDiarization()
speaker_change_split = self.settings.GetOption("speaker_change_split")
min_speakers = self.settings.GetOption("min_speakers")
max_speakers = self.settings.GetOption("max_speakers")
min_speaker_length = self.settings.GetOption("min_speaker_length")
clip_duration = phrase_time_limit
            fps = 0
            if clip_duration is not None:
                # number of audio chunks that fit into the phrase time limit
                fps = int(self.recorded_sample_rate / CHUNK * clip_duration)
end_time = time.time()
elapsed_time = end_time - self.start_time
elapsed_intermediate_time = end_time - self.intermediate_time_start
confidence_threshold = float(self.settings.GetOption("vad_confidence_threshold"))
########
# create denoise ring buffer
########
if self.audio_filter_buffer is None and self.settings.GetOption("denoise_audio") != "" and self.settings.GetOption("denoise_audio_before_trigger") and self.audio_enhancer is not None:
                # calculate the buffer size for 10 seconds of audio
ringbuffer_channel_num = 1
ringbuffer_audio_bytes_per_sample = 2
ringbuffer_audio_length = 10 # ringbuffer audio length in seconds
ringbuffer_size = int(self.default_sample_rate * ringbuffer_audio_length * ringbuffer_channel_num * ringbuffer_audio_bytes_per_sample)
print("sample size for noise cancelling ring buffer:", ringbuffer_size)
self.audio_filter_buffer = audio_tools.CircularByteBuffer(ringbuffer_size)
# copy bytes into variables
test_audio_chunk = in_data[:]
audio_chunk = in_data[:]
# special case which seems to be needed for WASAPI
if self.needs_sample_rate_conversion and test_audio_chunk is not None:
test_audio_chunk = audio_tools.resample_audio(test_audio_chunk, self.recorded_sample_rate,
self.default_sample_rate, target_channels=1,
input_channels=self.input_channel_num).tobytes()
new_confidence, peak_amplitude = 0, 0
if test_audio_chunk is not None:
# denoise audio
if self.settings.GetOption("denoise_audio") != "" and self.settings.GetOption("denoise_audio_before_trigger") and self.audio_enhancer is not None and self.audio_filter_buffer is not None:
                    # @TODO: audio chunks probably need overlap to prevent crackling noise (possibly a buffer underrun caused by the processing time of the noise filter?)
self.audio_filter_buffer.append(test_audio_chunk)
new_confidence, peak_amplitude = process_audio_chunk(test_audio_chunk, self.default_sample_rate,
self.vad_model)
if 0 < energy <= peak_amplitude and new_confidence >= confidence_threshold:
#print("denoising...")
########
                        # denoise using a ring buffer (so the filter always works on a larger audio snippet)
########
test_audio_buffered = self.audio_filter_buffer.get_ordered_buffer()
test_audio_chunk_buffered_enhanced = self.audio_enhancer.enhance_audio(test_audio_buffered, sample_rate=self.default_sample_rate, output_sample_rate=self.default_sample_rate)
chunk_length = len(test_audio_chunk)
test_audio_chunk_denoise = test_audio_chunk_buffered_enhanced[-chunk_length:].tobytes()
# check confidence and peak amplitude again with denoised audio chunk
new_confidence_denoised, peak_amplitude_denoised = process_audio_chunk(test_audio_chunk_denoise, self.default_sample_rate,
self.vad_model)
########
# denoise only on the single chunk
########
# test_audio_chunk_buffered_enhanced = self.audio_enhancer.enhance_audio(test_audio_chunk, sample_rate=self.default_sample_rate, output_sample_rate=self.default_sample_rate)
# new_confidence_denoised, peak_amplitude_denoised = process_audio_chunk(test_audio_chunk_buffered_enhanced, self.default_sample_rate,
# self.vad_model)
                        # keep the lower of the initial and denoised values
if new_confidence_denoised < new_confidence:
new_confidence = new_confidence_denoised
if peak_amplitude_denoised < peak_amplitude:
peak_amplitude = peak_amplitude_denoised
else:
try:
new_confidence, peak_amplitude = process_audio_chunk(test_audio_chunk, self.default_sample_rate,
self.vad_model)
except Exception as e:
print(f"Error processing audio_chunk: {e}")
traceback.print_exc()
# put frames with recognized speech into a list and send to whisper
if (clip_duration is not None and len(self.frames) > fps) or (
elapsed_time > pause > 0.0 and len(self.frames) > 0) or (
self.keyboard_rec_force_stop and self.push_to_talk_key is not None and not keyboard.is_pressed(
self.push_to_talk_key) and len(self.frames) > 0) or (
use_speaker_diarization and self._new_speaker and self.diarization_model is not None and len(
self.frames) > 0):
if self.before_recording_send_to_queue_callback_func is not None and callable(self.before_recording_send_to_queue_callback_func):
self.before_recording_send_to_queue_callback_func(self)
#clip = []
# merge all frames to one audio clip
#for i in range(0, len(self.frames)):
# if self.frames[i] is not None:
# clip.append(self.frames[i])
#if len(clip) > 0:
# wavefiledata = b''.join(clip)
if len(self.frames) > 0:
wavefiledata = b''.join(self.frames)
else:
return None, pyaudio.paContinue
if self.needs_sample_rate_conversion:
wavefiledata = audio_tools.resample_audio(wavefiledata, self.recorded_sample_rate,
self.default_sample_rate, target_channels=1,
input_channels=self.input_channel_num).tobytes()
                # normalize audio (only if the clip is at least as long as pyloudnorm's default block size)
if normalize_enabled and len(wavefiledata) >= self.block_size_samples:
wavefiledata = audio_tools.convert_audio_datatype_to_float(np.frombuffer(wavefiledata, np.int16))
wavefiledata, lufs = audio_tools.normalize_audio_lufs(
wavefiledata, self.default_sample_rate, normalize_lower_threshold, normalize_upper_threshold,
normalize_gain_factor, verbose=self.verbose
)
wavefiledata = audio_tools.convert_audio_datatype_to_integer(wavefiledata, np.int16)
#wavefiledata = wavefiledata.tobytes()
# remove silence from audio
if silence_cutting_enabled:
                    if isinstance(wavefiledata, bytes):
wavefiledata_np = np.frombuffer(wavefiledata, np.int16)
else:
wavefiledata_np = wavefiledata
if len(wavefiledata_np) >= self.block_size_samples:
wavefiledata = audio_tools.remove_silence_parts(
wavefiledata_np, self.default_sample_rate,
silence_offset=silence_offset, max_silence_length=max_silence_length,
keep_silence_length=keep_silence_length,
verbose=self.verbose
)
#wavefiledata = wavefiledata.tobytes()
                if isinstance(wavefiledata, np.ndarray):
                    wavefiledata = wavefiledata.tobytes()
# debug save of audio clip
# save_to_wav(wavefiledata, "resampled_audio_chunk.wav", self.default_sample_rate)
# check if the full audio clip is above the confidence threshold
vad_clip_test = self.settings.GetOption("vad_on_full_clip")
full_audio_confidence = 0.
if vad_clip_test:
audio_full_int16 = np.frombuffer(wavefiledata, np.int16)
audio_full_float32 = int2float(audio_full_int16)
full_audio_confidence = self.vad_model.run_vad(torch.from_numpy(audio_full_float32),
self.default_sample_rate).item()
                    print("full clip VAD confidence:", full_audio_confidence)
if ((not vad_clip_test) or (vad_clip_test and full_audio_confidence >= confidence_threshold)) and len(
wavefiledata) > 0:
# denoise audio
if self.settings.GetOption("denoise_audio") != "" and self.audio_enhancer is not None:
wavefiledata = self.audio_enhancer.enhance_audio(wavefiledata).tobytes()
# call sts plugin methods
if self.plugins is not None:
call_plugin_sts(self.plugins, wavefiledata, self.default_sample_rate)
wave_file_bytes = audio_tools.audio_bytes_to_wav(wavefiledata, channels=CHANNELS,
sample_rate=SAMPLE_RATE)
#if self.diarization_model is not None and use_speaker_diarization:
# wave_file_bytes = self.diarization_model.diarize(wave_file_bytes, min_speakers=min_speakers,
# max_speakers=max_speakers, min_segment_length=min_speaker_length)
if isinstance(wave_file_bytes, list):
for audio_segment in wave_file_bytes:
self.audio_queue.put(
{'time': time.time_ns(), 'data': audio_segment, 'final': True, 'settings': self.settings, 'plugins': self.plugins})
else:
self.audio_queue.put(
{'time': time.time_ns(), 'data': wave_file_bytes, 'final': True, 'settings': self.settings, 'plugins': self.plugins})
# vad_iterator.reset_states() # reset model states after each audio
# write wav file if configured to do so
transcription_save_audio_dir = self.settings.GetOption("transcription_save_audio_dir")
if transcription_save_audio_dir is not None and transcription_save_audio_dir != "":
start_time_str = Utilities.ns_to_datetime(time.time_ns(), formatting='%Y-%m-%d %H_%M_%S-%f')
audio_file_name = f"audio_transcript_{start_time_str}.wav"
transcription_save_audio_dir = Path(transcription_save_audio_dir)
audio_file_path = transcription_save_audio_dir / audio_file_name
threading.Thread(
target=save_to_wav,
args=(wavefiledata, str(audio_file_path.resolve()), self.default_sample_rate,)
).start()
# set typing indicator for VRChat and Websocket clients
if self.typing_indicator_function is not None:
typing_indicator_thread = threading.Thread(target=self.typing_indicator_function,
args=(self.osc_ip, self.osc_port, True))
typing_indicator_thread.start()
self.frames = []
self.start_time = time.time()
self.intermediate_time_start = time.time()
self.keyboard_rec_force_stop = False
if self.new_speaker_audio is not None:
print("Added new speaker audio of new speaker to new audio recording chunk")
self.frames.append(self.new_speaker_audio)
self.new_speaker_audio = None
self._new_speaker = False
if audio_chunk is None:
return None, pyaudio.paContinue
            # start recording if the volume and the voice confidence are above their thresholds
if self.should_start_recording(peak_amplitude, energy, new_confidence, confidence_threshold,
keyboard_key=self.push_to_talk_key):
if self.before_recording_starts_callback_func is not None and callable(self.before_recording_starts_callback_func):
self.before_recording_starts_callback_func(self)
if self.verbose:
print("start recording - new_confidence: " + str(new_confidence) + " peak_amplitude: " + str(
peak_amplitude))
if not self.start_rec_on_volume_threshold:
                    # signal the processing start (typing indicator)
if self.typing_indicator_function is not None:
typing_indicator_thread = threading.Thread(target=self.typing_indicator_function,
args=(self.osc_ip, self.osc_port, True))
typing_indicator_thread.start()
if self.push_to_talk_key is not None and keyboard.is_pressed(self.push_to_talk_key):
self.keyboard_rec_force_stop = True
self.start_rec_on_volume_threshold = True
self.pause_time = time.time()
            # append the audio frame to the list while recording is active and the voice confidence is above the threshold (so only parts containing speech are added)
if self.start_rec_on_volume_threshold and new_confidence >= confidence_threshold:
if self.before_recording_running_callback_func is not None and callable(self.before_recording_running_callback_func):
self.before_recording_running_callback_func(self)
if self.verbose:
print("add chunk - new_confidence: " + str(new_confidence) + " peak_amplitude: " + str(
peak_amplitude))
                # prepend the previous audio chunk so speech is not clipped when recording starts slightly too late
if self.previous_audio_chunk is not None:
self.frames.append(self.previous_audio_chunk)
self.frames.append(audio_chunk)
self.start_time = time.time()
if self.settings.GetOption("realtime"):
#clip = []
frame_count = len(self.frames)
                # send intermediate realtime results every 'realtime_frame_multiply' frames, provided at least 'realtime_frequency_time' seconds have elapsed
if frame_count % self.settings.GetOption(
"realtime_frame_multiply") == 0 and elapsed_intermediate_time > self.settings.GetOption(
"realtime_frequency_time"):
# set typing indicator for VRChat but not websocket
if self.typing_indicator_function is not None:
typing_indicator_thread = threading.Thread(target=self.typing_indicator_function,
args=(self.osc_ip, self.osc_port, False))
typing_indicator_thread.start()
# merge all frames to one audio clip
#for i in range(0, len(self.frames)):
# clip.append(self.frames[i])
#if len(clip) > 0:
if len(self.frames) > 0:
wavefiledata = b''.join(self.frames)
else:
return None, pyaudio.paContinue
if self.needs_sample_rate_conversion:
wavefiledata = audio_tools.resample_audio(wavefiledata, self.recorded_sample_rate,
self.default_sample_rate, target_channels=1,
input_channels=self.input_channel_num).tobytes()
                    # normalize audio (only if the clip is at least as long as pyloudnorm's default block size)
if normalize_enabled and len(wavefiledata) >= self.block_size_samples:
wavefiledata = audio_tools.convert_audio_datatype_to_float(
np.frombuffer(wavefiledata, np.int16))
wavefiledata, lufs = audio_tools.normalize_audio_lufs(
wavefiledata, self.default_sample_rate, normalize_lower_threshold,
normalize_upper_threshold, normalize_gain_factor,
verbose=self.verbose
)
wavefiledata = audio_tools.convert_audio_datatype_to_integer(wavefiledata, np.int16)
#wavefiledata = wavefiledata.tobytes()
# remove silence from audio
if silence_cutting_enabled:
                        if isinstance(wavefiledata, bytes):
wavefiledata_np = np.frombuffer(wavefiledata, np.int16)
else:
wavefiledata_np = wavefiledata
#wavefiledata_np = np.frombuffer(wavefiledata, np.int16)
if len(wavefiledata_np) >= self.block_size_samples:
wavefiledata = audio_tools.remove_silence_parts(
wavefiledata_np, self.default_sample_rate,
silence_offset=silence_offset, max_silence_length=max_silence_length,
keep_silence_length=keep_silence_length,
verbose=self.verbose
)
#wavefiledata = wavefiledata.tobytes()
                    if isinstance(wavefiledata, np.ndarray):
                        wavefiledata = wavefiledata.tobytes()
if wavefiledata is not None and len(wavefiledata) > 0:
# denoise audio
if self.settings.GetOption("denoise_audio") != "" and self.audio_enhancer is not None:
wavefiledata = self.audio_enhancer.enhance_audio(wavefiledata).tobytes()
wave_file_bytes = audio_tools.audio_bytes_to_wav(wavefiledata, channels=CHANNELS,
sample_rate=SAMPLE_RATE)
# write wav file if configured to do so (debugging!!!)
transcription_save_audio_dir = self.settings.GetOption("transcription_save_audio_dir")
if transcription_save_audio_dir is not None and transcription_save_audio_dir != "":
start_time_str = Utilities.ns_to_datetime(time.time_ns(), formatting='%Y-%m-%d %H_%M_%S-%f')
audio_file_name = f"audio_transcript_{start_time_str}_intermediate.wav"
transcription_save_audio_dir = Path(transcription_save_audio_dir)
audio_file_path = transcription_save_audio_dir / audio_file_name
threading.Thread(
target=save_to_wav,
args=(wavefiledata, str(audio_file_path.resolve()), self.default_sample_rate,)
).start()
tmp_wave_file_bytes = None
if self.diarization_model is not None and use_speaker_diarization:
#tmp_wave_file_bytes = b''.join(self.frames)
tmp_wave_file_bytes = wavefiledata
tmp_wave_file_bytes = self.diarization_model.diarize(tmp_wave_file_bytes,
sample_rate=self.default_sample_rate,
min_speakers=min_speakers,
max_speakers=max_speakers,
min_segment_length=min_speaker_length,
bytes_sample_rate=self.recorded_sample_rate,
bytes_channel_num=self.input_channel_num)
if isinstance(tmp_wave_file_bytes, list) and tmp_wave_file_bytes:
print("multiple speaker audio detected")
#self.audio_queue.put(
# {'time': time.time_ns(), 'data': tmp_wave_file_bytes[-1], 'final': False})
if speaker_change_split and len(tmp_wave_file_bytes) > 1:
print("detected new speaker")
#self.frames = [audio_tools.split_audio_with_padding(frame, CHUNK, merge_to_bytes=True) for frame in wave_file_bytes[:-1]]
self.frames = [frame for frame in tmp_wave_file_bytes[:-1]]
#self.frames = [audio_tools.resample_audio(frame,
# recorded_sample_rate=self.default_sample_rate,
# target_sample_rate=self.recorded_sample_rate,
# target_channels=self.input_channel_num,
# input_channels=self.input_channel_num,
# dtype="int16",
# ) for frame in wave_file_bytes[:-1]]
#self.new_speaker_audio = audio_tools.split_audio_with_padding(wave_file_bytes[-1], CHUNK, merge_to_bytes=True)
self.new_speaker_audio = tmp_wave_file_bytes[-1]
##self.new_speaker_audio = audio_tools.split_audio_with_padding(wave_file_bytes[-1], CHUNK)
# remove last element from frames list
#self.frames.pop()
# convert into bytes per audio frame
##self.frames = audio_tools.split_audio_with_padding(wave_file_bytes[0], CHUNK)
self._new_speaker = True
#elif not isinstance(wave_file_bytes, list):
else:
self.audio_queue.put(
{'time': time.time_ns(), 'data': wave_file_bytes, 'final': False, 'settings': self.settings, 'plugins': self.plugins})
else:
self.frames = []
self.intermediate_time_start = time.time()
# stop recording if no speech is detected for pause seconds
if self.should_stop_recording(new_confidence, confidence_threshold, peak_amplitude, energy, self.pause_time,
pause,
keyboard_key=self.push_to_talk_key):
self.start_rec_on_volume_threshold = False
self.intermediate_time_start = time.time()
                # keep the force-stop flag only while a push-to-talk key is configured and currently released
                self.keyboard_rec_force_stop = (self.push_to_talk_key is not None
                                                and not keyboard.is_pressed(self.push_to_talk_key)
                                                and self.keyboard_rec_force_stop)
# save chunk as previous audio chunk to reuse later
if not self.start_rec_on_volume_threshold and (
new_confidence < confidence_threshold or confidence_threshold == 0.0):
self.previous_audio_chunk = audio_chunk
else:
self.previous_audio_chunk = None
# self.last_callback_time = time.time()
return in_data, pyaudio.paContinue
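
# A minimal usage sketch (illustrative only): it assumes a settings object that
# only needs a GetOption(name) lookup and a queue consumer that understands the
# dicts put on the queue above. '_StubSettings' and its option values are
# hypothetical stand-ins, not the application's real configuration.
#
# import queue
#
# class _StubSettings:
#     _options = {
#         'stt_enabled': True, 'phrase_time_limit': 0, 'pause': 1.0,
#         'energy': 300, 'vad_confidence_threshold': 0.6,
#         'silence_cutting_enabled': False, 'normalize_enabled': False,
#         'speaker_diarization': False, 'denoise_audio': '',
#         'vad_on_full_clip': False, 'realtime': False,
#         'transcription_save_audio_dir': '',
#     }
#     def GetOption(self, name):
#         return self._options.get(name)
#
# audio_queue = queue.Queue()
# processor = AudioProcessor(chunk=CHUNK, channels=CHANNELS,
#                            sample_format=pyaudio.paInt16,
#                            audio_queue=audio_queue, settings=_StubSettings(),
#                            recorded_sample_rate=SAMPLE_RATE)
# pa = pyaudio.PyAudio()
# # the stream starts capturing immediately; finished clips appear on audio_queue
# stream = pa.open(format=pyaudio.paInt16, channels=CHANNELS, rate=SAMPLE_RATE,
#                  input=True, frames_per_buffer=CHUNK,
#                  stream_callback=processor.callback)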