Skip to content

Commit

Permalink
Merge branch 'main' of github.com:linozen/verbatim
Browse files Browse the repository at this point in the history
  • Loading branch information
linozen committed Jan 19, 2025
2 parents ee99cb3 + eb3cb3f commit 1868126
Show file tree
Hide file tree
Showing 24 changed files with 453 additions and 107 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ jobs:
COVERAGE_PATH: .

- name: Store Pull Request comment to be posted
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
with:
# If you use a different name, update COMMENT_ARTIFACT_NAME accordingly
Expand Down
1 change: 1 addition & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ disable=raw-checker-failed,
too-many-instance-attributes,
no-else-return,
logging-not-lazy,
simplifiable-if-statement,

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
Expand Down
8 changes: 0 additions & 8 deletions verbatim/audio/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import numpy as np
from numpy.typing import NDArray
from pydub import AudioSegment
from scipy.signal import resample

# Configure logger
Expand Down Expand Up @@ -126,10 +125,3 @@ def timestr_to_samples(timestr: str, sample_rate: int = 16000) -> int:

# Convert to sample index
return int(total_seconds * sample_rate)


def convert_mp3_to_wav(input_mp3, output_wav):
    """Decode the MP3 at ``input_mp3`` with pydub and export it as WAV to ``output_wav``."""
    # Load and export in one chain; the intermediate segment is not needed afterwards.
    AudioSegment.from_mp3(input_mp3).export(output_wav, format="wav")
20 changes: 20 additions & 0 deletions verbatim/audio/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os

def convert_to_wav(
    input_path: str,
    working_prefix_no_ext: str,
    preserve_channels: bool = False,
    overwrite: bool = True,
) -> str:
    """Decode an audio file and write it out as ``<working_prefix_no_ext>.wav``.

    Args:
        input_path: Path of the audio file to decode.
        working_prefix_no_ext: Output path without its extension; ``.wav`` is
            appended to build the target path.
        preserve_channels: Forwarded unchanged to both ``PyAVAudioSource`` and
            ``WavSink.dump_to_wav``.
        overwrite: When ``False`` and the target file already exists, the
            conversion is skipped and the existing path is returned.

    Returns:
        The path of the WAV file (``working_prefix_no_ext + ".wav"``).
    """
    converted_path = working_prefix_no_ext + ".wav"

    # Reuse a previous conversion. This check runs before the imports below so
    # the cached path never has to load the decoding backend.
    if not overwrite and os.path.exists(converted_path):
        return converted_path

    # Function-scope imports, presumably to avoid a circular import between
    # this module and the ``sources`` package -- TODO confirm.
    # pylint: disable=import-outside-toplevel
    from .sources.ffmpegfileaudiosource import PyAVAudioSource
    from .sources.wavsink import WavSink

    decoded_source = PyAVAudioSource(file_path=input_path, preserve_channels=preserve_channels)
    WavSink.dump_to_wav(audio_source=decoded_source, output_path=converted_path, preserve_channels=preserve_channels)

    return converted_path
10 changes: 7 additions & 3 deletions verbatim/audio/sources/audiosource.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from abc import abstractmethod
from abc import ABC, abstractmethod
from typing import Optional

from pyannote.core.annotation import Annotation
from numpy.typing import NDArray


class AudioStream:
class AudioStream(ABC):
start_offset: int = 0
diarization: Optional[Annotation] = None

Expand All @@ -32,8 +32,12 @@ def next_chunk(self, chunk_length=1) -> NDArray:
def close(self):
pass

@abstractmethod
def get_nchannels(self) -> int:
    """Return the number of audio channels of this stream.

    Abstract: concrete stream classes provide the actual value.
    """


class AudioSource:
class AudioSource(ABC):
source_name: str = ""

def __init__(self, source_name: str):
Expand Down
50 changes: 19 additions & 31 deletions verbatim/audio/sources/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,10 @@

from ...voices.diarize.factory import create_diarizer # Add this import
from ..audio import samples_to_seconds, timestr_to_samples
from ..convert import convert_to_wav
from .audiosource import AudioSource
from .sourceconfig import SourceConfig


def convert_to_wav(input_path: str, working_prefix_no_ext: str, preserve_stereo: bool = False) -> str:
    """Decode ``input_path`` and dump it to ``working_prefix_no_ext + ".wav"``.

    ``preserve_stereo`` is passed to the decoder as ``preserve_channels`` and
    to ``WavSink.dump_to_wav`` as ``preserve_stereo``. Returns the WAV path.
    """
    # pylint: disable=import-outside-toplevel
    from .ffmpegfileaudiosource import PyAVAudioSource
    from .wavsink import WavSink

    decoded_source = PyAVAudioSource(file_path=input_path, preserve_channels=preserve_stereo)

    wav_path = working_prefix_no_ext + ".wav"
    WavSink.dump_to_wav(audio_source=decoded_source, output_path=wav_path, preserve_stereo=preserve_stereo)
    return wav_path


def compute_diarization(
file_path: str,
device: str,
Expand Down Expand Up @@ -98,10 +86,15 @@ def create_audio_source(
preserve_channels=source_config.diarization_strategy == "stereo",
)

if source_config.diarization_strategy == "stereo":
preserve_channels = True
else:
preserve_channels = False

input_source = convert_to_wav(
input_path=input_source,
working_prefix_no_ext=working_prefix_no_ext,
preserve_stereo=source_config.diarization_strategy == "stereo",
preserve_channels=preserve_channels,
)

return create_audio_source(
Expand Down Expand Up @@ -158,6 +151,7 @@ def create_audio_source(

def create_separate_speaker_sources(
*,
strategy: str = "pyannote",
input_source: str,
device: str,
source_config: SourceConfig = SourceConfig(),
Expand All @@ -169,9 +163,10 @@ def create_separate_speaker_sources(
# pylint: disable=import-outside-toplevel

if os.path.splitext(input_source)[-1] != ".wav":
converted_input_source = convert_to_wav(input_path=input_source, working_prefix_no_ext=working_prefix_no_ext)
converted_input_source = convert_to_wav(input_path=input_source, working_prefix_no_ext=working_prefix_no_ext, preserve_channels=True)
return create_separate_speaker_sources(
input_source=converted_input_source,
strategy=strategy,
device=device,
source_config=source_config,
start_time=start_time,
Expand All @@ -190,27 +185,20 @@ def create_separate_speaker_sources(
start_sample: int = timestr_to_samples(start_time) if start_time else 0
stop_sample: Optional[int] = timestr_to_samples(stop_time) if stop_time else None

from ...voices.separation import SpeakerSeparation
from .fileaudiosource import FileAudioSource

sources: List[AudioSource] = []
from ...voices.separate.factory import create_separator

with SpeakerSeparation(device=device, huggingface_token=os.getenv("HUGGINGFACE_TOKEN", "")) as separation:
diarization, speaker_wav_files = separation.separate_speakers(
with create_separator(
strategy=strategy,
device=device,
huggingface_token=os.getenv("HUGGINGFACE_TOKEN", ""),
diarization_strategy=source_config.diarization_strategy) as separation:
sources = separation.separate_speakers(
file_path=input_source,
out_rttm_file=source_config.diarization_file,
out_speaker_wav_prefix=working_prefix_no_ext,
nb_speakers=nb_speakers,
diarization_strategy=source_config.diarization_strategy,
start_sample=start_sample,
end_sample=stop_sample
)
for _speaker, speaker_file in speaker_wav_files.items():
sources.append(
FileAudioSource(
file=speaker_file,
start_sample=start_sample,
end_sample=stop_sample,
diarization=diarization,
)
)

return sources
3 changes: 3 additions & 0 deletions verbatim/audio/sources/ffmpegfileaudiosource.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def close(self):
self._done_decoding = True
self._sample_buffer = np.array([], dtype=np.float32)

def get_nchannels(self) -> int:
    """Return the channel count reported by the wrapped ``_stream`` object."""
    channel_count = self._stream.channels
    return channel_count


class PyAVAudioSource(AudioSource):
"""
Expand Down
11 changes: 7 additions & 4 deletions verbatim/audio/sources/fileaudiosource.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .audiosource import AudioSource, AudioStream
from ..audio import format_audio
from ..audio import convert_mp3_to_wav
from ..convert import convert_to_wav
from ...voices.isolation import VoiceIsolation

LOG = logging.getLogger(__name__)
Expand All @@ -20,6 +20,7 @@

class FileAudioStream(AudioStream):
source: "FileAudioSource"
stream:wave.Wave_read

def __init__(self, source: "FileAudioSource", diarization: Optional[Annotation]):
super().__init__(start_offset=source.start_sample, diarization=diarization)
Expand Down Expand Up @@ -74,6 +75,9 @@ def has_more(self):
def close(self):
self.stream.close()

def get_nchannels(self) -> int:
    """Return the channel count of the open WAV reader held in ``self.stream``."""
    wav_reader = self.stream
    return wav_reader.getnchannels()


class FileAudioSource(AudioSource):
diarization: Optional[Annotation]
Expand All @@ -93,9 +97,8 @@ def __init__(
self.preserve_channels = preserve_channels
file_path_no_ext, file_path_ext = os.path.splitext(self.file_path)
if file_path_ext in COMPATIBLE_FORMATS:
# Convert mp3 to wav
wav_file_path = f"{file_path_no_ext}.wav"
convert_mp3_to_wav(self.file_path, wav_file_path)
# Convert encoded audio to wav
wav_file_path = convert_to_wav(input_path=self.file_path, working_prefix_no_ext=file_path_no_ext, preserve_channels=preserve_channels)
self.file_path = wav_file_path
self.end_sample = end_sample
self.start_sample = start_sample
Expand Down
15 changes: 12 additions & 3 deletions verbatim/audio/sources/micaudiosource.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
class MicAudioStreamSoundDevice(AudioStream):
source: "MicAudioSourceSoundDevice"
audio_queue: queue.Queue
stream = None
stream:sd.InputStream

def __init__(self, source: "MicAudioSourceSoundDevice"):
super().__init__(start_offset=0, diarization=None)
Expand Down Expand Up @@ -72,6 +72,11 @@ def next_chunk(self, chunk_length=1) -> NDArray:
def has_more(self):
return True

def get_nchannels(self) -> int:
    """Return the number of input channels of the sounddevice stream.

    ``channels`` is a single int on an input-only stream and an
    ``(input, output)`` pair on a duplex stream. Unconditionally unpacking it
    as a pair raises ``TypeError`` for the input-only case, so both shapes are
    handled here.
    """
    channels = self.stream.channels
    if isinstance(channels, (tuple, list)):
        # Duplex stream: the first entry is the input side.
        return int(channels[0])
    return int(channels)


class MicAudioSourceSoundDevice(AudioSource):
def __init__(self, sampling_rate: int = 16000, frames_per_buffer: int = 1024):
Expand All @@ -87,14 +92,15 @@ class MicAudioStreamPyAudio(AudioStream):
source: "MicAudioSourcePyAudio"
p: pyaudio.PyAudio
stream: pyaudio.Stream
nchannels:int = 1

def __init__(self, source: "MicAudioSourcePyAudio"):
def __init__(self, source: "MicAudioSourcePyAudio", nchannels:int = 1):
super().__init__(start_offset=0, diarization=None)
self.source = source
self.p: pyaudio.PyAudio = pyaudio.PyAudio()
self.stream: pyaudio.Stream = self.p.open(
format=pyaudio.paInt16,
channels=1,
channels=nchannels,
rate=self.source.sampling_rate,
input=True,
frames_per_buffer=self.source.frames_per_buffer,
Expand Down Expand Up @@ -126,6 +132,9 @@ def close(self):
def has_more(self):
return True

def get_nchannels(self) -> int:
    """Return the channel count stored in ``self.nchannels``."""
    # NOTE(review): confirm that __init__ stores its ``nchannels`` argument on
    # the instance; otherwise this always yields the class-level default.
    channel_count = self.nchannels
    return channel_count


class MicAudioSourcePyAudio(AudioSource):
frames_per_iter: int
Expand Down
3 changes: 3 additions & 0 deletions verbatim/audio/sources/pcmaudiosource.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def close(self):
def has_more(self) -> bool:
return self._has_more

def get_nchannels(self) -> int:
    """Return the channel count configured on the owning source (``self.source.channels``)."""
    owner = self.source
    return owner.channels


class PCMInputStreamAudioSource(AudioSource):
stream: BinaryIO
Expand Down
4 changes: 2 additions & 2 deletions verbatim/audio/sources/wavsink.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class WavSink:
@staticmethod
def dump_to_wav(audio_source: AudioSource, output_path: str, sample_rate: int = 16000, preserve_stereo: bool = False):
def dump_to_wav(audio_source: AudioSource, output_path: str, sample_rate: int = 16000, preserve_channels: bool = False):
"""
Dump the entire audio content from PyAVAudioSource to a .wav file.
"""
Expand All @@ -15,7 +15,7 @@ def dump_to_wav(audio_source: AudioSource, output_path: str, sample_rate: int =
# pylint: disable=no-member
with wave.open(output_path, "w") as wav_file:
# Set WAV parameters
num_channels = 2 if preserve_stereo else 1 # Stereo or mono
num_channels = audio_stream.get_nchannels() if preserve_channels else 1 # Stereo or mono
sample_width = 2 # 16-bit PCM
frame_rate = sample_rate # Target sample rate
wav_file.setnchannels(num_channels)
Expand Down
5 changes: 4 additions & 1 deletion verbatim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ def __call__(self, parser, namespace, values, option_string=None):
)
parser.add_argument(
"--separate",
action="store_true",
nargs="?",
action=OptionalValueAction,
default=None,
help="Enables speaker voice separation and process each speaker separately",
)
parser.add_argument(
Expand Down Expand Up @@ -324,6 +326,7 @@ def __call__(self, parser, namespace, values, option_string=None):
audio_sources: List[AudioSource] = []
if args.separate:
audio_sources += create_separate_speaker_sources(
strategy=args.separate or "pyannote",
source_config=source_config,
device=config.device,
input_source=source_path,
Expand Down
Loading

0 comments on commit 1868126

Please sign in to comment.