Merge branch 'main' of github.com:linozen/verbatim

linozen · Jan 19, 2025 · 1868126
2 parents ee99cb3 + eb3cb3f
commit 1868126
Show file tree

Hide file tree

Showing 24 changed files with 453 additions and 107 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -84,7 +84,7 @@ jobs:
           COVERAGE_PATH: .
 
       - name: Store Pull Request comment to be posted
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
         with:
           # If you use a different name, update COMMENT_ARTIFACT_NAME accordingly

diff --git a/.pylintrc b/.pylintrc
@@ -444,6 +444,7 @@ disable=raw-checker-failed,
         too-many-instance-attributes,
         no-else-return,
         logging-not-lazy,
+        simplifiable-if-statement,
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option

diff --git a/verbatim/audio/audio.py b/verbatim/audio/audio.py
@@ -4,7 +4,6 @@
 
 import numpy as np
 from numpy.typing import NDArray
-from pydub import AudioSegment
 from scipy.signal import resample
 
 # Configure logger
@@ -126,10 +125,3 @@ def timestr_to_samples(timestr: str, sample_rate: int = 16000) -> int:
 
     # Convert to sample index
     return int(total_seconds * sample_rate)
-
-
-def convert_mp3_to_wav(input_mp3, output_wav):
-    # Load the mp3 file
-    audio = AudioSegment.from_mp3(input_mp3)
-    # Export the audio as wav
-    audio.export(output_wav, format="wav")
diff --git a/verbatim/audio/convert.py b/verbatim/audio/convert.py
@@ -0,0 +1,20 @@
+import os
+
+def convert_to_wav(
+        input_path: str,
+        working_prefix_no_ext: str,
+        preserve_channels: bool = False,
+        overwrite = True) -> str:
+    # pylint: disable=import-outside-toplevel
+    from .sources.ffmpegfileaudiosource import PyAVAudioSource
+    from .sources.wavsink import WavSink
+
+    converted_path = working_prefix_no_ext + ".wav"
+
+    if not overwrite and os.path.exists(converted_path) is True:
+        return converted_path
+
+    temp_file_audio_source = PyAVAudioSource(file_path=input_path, preserve_channels=preserve_channels)
+    WavSink.dump_to_wav(audio_source=temp_file_audio_source, output_path=converted_path, preserve_channels=preserve_channels)
+
+    return converted_path
diff --git a/verbatim/audio/sources/audiosource.py b/verbatim/audio/sources/audiosource.py
@@ -1,11 +1,11 @@
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from typing import Optional
 
 from pyannote.core.annotation import Annotation
 from numpy.typing import NDArray
 
 
-class AudioStream:
+class AudioStream(ABC):
     start_offset: int = 0
     diarization: Optional[Annotation] = None
 
@@ -32,8 +32,12 @@ def next_chunk(self, chunk_length=1) -> NDArray:
     def close(self):
         pass
 
+    @abstractmethod
+    def get_nchannels(self) -> int:
+        pass
+
 
-class AudioSource:
+class AudioSource(ABC):
     source_name: str = ""
 
     def __init__(self, source_name: str):

diff --git a/verbatim/audio/sources/factory.py b/verbatim/audio/sources/factory.py
@@ -8,22 +8,10 @@
 
 from ...voices.diarize.factory import create_diarizer  # Add this import
 from ..audio import samples_to_seconds, timestr_to_samples
+from ..convert import convert_to_wav
 from .audiosource import AudioSource
 from .sourceconfig import SourceConfig
 
-
-def convert_to_wav(input_path: str, working_prefix_no_ext: str, preserve_stereo: bool = False) -> str:
-    # pylint: disable=import-outside-toplevel
-    from .ffmpegfileaudiosource import PyAVAudioSource
-    from .wavsink import WavSink
-
-    temp_file_audio_source = PyAVAudioSource(file_path=input_path, preserve_channels=preserve_stereo)
-
-    converted_path = working_prefix_no_ext + ".wav"
-    WavSink.dump_to_wav(audio_source=temp_file_audio_source, output_path=converted_path, preserve_stereo=preserve_stereo)
-    return converted_path
-
-
 def compute_diarization(
     file_path: str,
     device: str,
@@ -98,10 +86,15 @@ def create_audio_source(
                 preserve_channels=source_config.diarization_strategy == "stereo",
             )
 
+        if source_config.diarization_strategy == "stereo":
+            preserve_channels = True
+        else:
+            preserve_channels = False
+
         input_source = convert_to_wav(
             input_path=input_source,
             working_prefix_no_ext=working_prefix_no_ext,
-            preserve_stereo=source_config.diarization_strategy == "stereo",
+            preserve_channels=preserve_channels,
         )
 
         return create_audio_source(
@@ -158,6 +151,7 @@ def create_audio_source(
 
 def create_separate_speaker_sources(
     *,
+    strategy: str = "pyannote",
     input_source: str,
     device: str,
     source_config: SourceConfig = SourceConfig(),
@@ -169,9 +163,10 @@ def create_separate_speaker_sources(
     # pylint: disable=import-outside-toplevel
 
     if os.path.splitext(input_source)[-1] != ".wav":
-        converted_input_source = convert_to_wav(input_path=input_source, working_prefix_no_ext=working_prefix_no_ext)
+        converted_input_source = convert_to_wav(input_path=input_source, working_prefix_no_ext=working_prefix_no_ext, preserve_channels=True)
         return create_separate_speaker_sources(
             input_source=converted_input_source,
+            strategy=strategy,
             device=device,
             source_config=source_config,
             start_time=start_time,
@@ -190,27 +185,20 @@ def create_separate_speaker_sources(
     start_sample: int = timestr_to_samples(start_time) if start_time else 0
     stop_sample: Optional[int] = timestr_to_samples(stop_time) if stop_time else None
 
-    from ...voices.separation import SpeakerSeparation
-    from .fileaudiosource import FileAudioSource
-
-    sources: List[AudioSource] = []
+    from ...voices.separate.factory import create_separator
 
-    with SpeakerSeparation(device=device, huggingface_token=os.getenv("HUGGINGFACE_TOKEN", "")) as separation:
-        diarization, speaker_wav_files = separation.separate_speakers(
+    with create_separator(
+        strategy=strategy,
+        device=device,
+        huggingface_token=os.getenv("HUGGINGFACE_TOKEN", ""),
+        diarization_strategy=source_config.diarization_strategy) as separation:
+        sources = separation.separate_speakers(
             file_path=input_source,
             out_rttm_file=source_config.diarization_file,
             out_speaker_wav_prefix=working_prefix_no_ext,
             nb_speakers=nb_speakers,
-            diarization_strategy=source_config.diarization_strategy,
+            start_sample=start_sample,
+            end_sample=stop_sample
         )
-        for _speaker, speaker_file in speaker_wav_files.items():
-            sources.append(
-                FileAudioSource(
-                    file=speaker_file,
-                    start_sample=start_sample,
-                    end_sample=stop_sample,
-                    diarization=diarization,
-                )
-            )
 
     return sources
diff --git a/verbatim/audio/sources/ffmpegfileaudiosource.py b/verbatim/audio/sources/ffmpegfileaudiosource.py
@@ -154,6 +154,9 @@ def close(self):
         self._done_decoding = True
         self._sample_buffer = np.array([], dtype=np.float32)
 
+    def get_nchannels(self) -> int:
+        return self._stream.channels
+
 
 class PyAVAudioSource(AudioSource):
     """

diff --git a/verbatim/audio/sources/fileaudiosource.py b/verbatim/audio/sources/fileaudiosource.py
@@ -10,7 +10,7 @@
 
 from .audiosource import AudioSource, AudioStream
 from ..audio import format_audio
-from ..audio import convert_mp3_to_wav
+from ..convert import convert_to_wav
 from ...voices.isolation import VoiceIsolation
 
 LOG = logging.getLogger(__name__)
@@ -20,6 +20,7 @@
 
 class FileAudioStream(AudioStream):
     source: "FileAudioSource"
+    stream:wave.Wave_read
 
     def __init__(self, source: "FileAudioSource", diarization: Optional[Annotation]):
         super().__init__(start_offset=source.start_sample, diarization=diarization)
@@ -74,6 +75,9 @@ def has_more(self):
     def close(self):
         self.stream.close()
 
+    def get_nchannels(self) -> int:
+        return self.stream.getnchannels()
+
 
 class FileAudioSource(AudioSource):
     diarization: Optional[Annotation]
@@ -93,9 +97,8 @@ def __init__(
         self.preserve_channels = preserve_channels
         file_path_no_ext, file_path_ext = os.path.splitext(self.file_path)
         if file_path_ext in COMPATIBLE_FORMATS:
-            # Convert mp3 to wav
-            wav_file_path = f"{file_path_no_ext}.wav"
-            convert_mp3_to_wav(self.file_path, wav_file_path)
+            # Convert encoded audio to wav
+            wav_file_path = convert_to_wav(input_path=self.file_path, working_prefix_no_ext=file_path_no_ext,  preserve_channels=preserve_channels)
             self.file_path = wav_file_path
         self.end_sample = end_sample
         self.start_sample = start_sample

diff --git a/verbatim/audio/sources/micaudiosource.py b/verbatim/audio/sources/micaudiosource.py
@@ -16,7 +16,7 @@
 class MicAudioStreamSoundDevice(AudioStream):
     source: "MicAudioSourceSoundDevice"
     audio_queue: queue.Queue
-    stream = None
+    stream:sd.InputStream
 
     def __init__(self, source: "MicAudioSourceSoundDevice"):
         super().__init__(start_offset=0, diarization=None)
@@ -72,6 +72,11 @@ def next_chunk(self, chunk_length=1) -> NDArray:
     def has_more(self):
         return True
 
+    def get_nchannels(self) -> int:
+        stream:sd.InputStream = self.stream
+        idevice, _odevice = stream.channels
+        return idevice
+
 
 class MicAudioSourceSoundDevice(AudioSource):
     def __init__(self, sampling_rate: int = 16000, frames_per_buffer: int = 1024):
@@ -87,14 +92,15 @@ class MicAudioStreamPyAudio(AudioStream):
     source: "MicAudioSourcePyAudio"
     p: pyaudio.PyAudio
     stream: pyaudio.Stream
+    nchannels:int = 1
 
-    def __init__(self, source: "MicAudioSourcePyAudio"):
+    def __init__(self, source: "MicAudioSourcePyAudio", nchannels:int = 1):
         super().__init__(start_offset=0, diarization=None)
         self.source = source
         self.p: pyaudio.PyAudio = pyaudio.PyAudio()
         self.stream: pyaudio.Stream = self.p.open(
             format=pyaudio.paInt16,
-            channels=1,
+            channels=nchannels,
             rate=self.source.sampling_rate,
             input=True,
             frames_per_buffer=self.source.frames_per_buffer,
@@ -126,6 +132,9 @@ def close(self):
     def has_more(self):
         return True
 
+    def get_nchannels(self) -> int:
+        return self.nchannels
+
 
 class MicAudioSourcePyAudio(AudioSource):
     frames_per_iter: int

diff --git a/verbatim/audio/sources/pcmaudiosource.py b/verbatim/audio/sources/pcmaudiosource.py
@@ -45,6 +45,9 @@ def close(self):
     def has_more(self) -> bool:
         return self._has_more
 
+    def get_nchannels(self) -> int:
+        return self.source.channels
+
 
 class PCMInputStreamAudioSource(AudioSource):
     stream: BinaryIO

diff --git a/verbatim/audio/sources/wavsink.py b/verbatim/audio/sources/wavsink.py
@@ -6,7 +6,7 @@
 
 class WavSink:
     @staticmethod
-    def dump_to_wav(audio_source: AudioSource, output_path: str, sample_rate: int = 16000, preserve_stereo: bool = False):
+    def dump_to_wav(audio_source: AudioSource, output_path: str, sample_rate: int = 16000, preserve_channels: bool = False):
         """
         Dump the entire audio content from PyAVAudioSource to a .wav file.
         """
@@ -15,7 +15,7 @@ def dump_to_wav(audio_source: AudioSource, output_path: str, sample_rate: int =
             # pylint: disable=no-member
             with wave.open(output_path, "w") as wav_file:
                 # Set WAV parameters
-                num_channels = 2 if preserve_stereo else 1  # Stereo or mono
+                num_channels = audio_stream.get_nchannels() if preserve_channels else 1  # Stereo or mono
                 sample_width = 2  # 16-bit PCM
                 frame_rate = sample_rate  # Target sample rate
                 wav_file.setnchannels(num_channels)

diff --git a/verbatim/main.py b/verbatim/main.py
@@ -151,7 +151,9 @@ def __call__(self, parser, namespace, values, option_string=None):
     )
     parser.add_argument(
         "--separate",
-        action="store_true",
+        nargs="?",
+        action=OptionalValueAction,
+        default=None,
         help="Enables speaker voice separation and process each speaker separately",
     )
     parser.add_argument(
@@ -324,6 +326,7 @@ def __call__(self, parser, namespace, values, option_string=None):
     audio_sources: List[AudioSource] = []
     if args.separate:
         audio_sources += create_separate_speaker_sources(
+            strategy=args.separate or "pyannote",
             source_config=source_config,
             device=config.device,
             input_source=source_path,