
Commit

Merge pull request #9 from ahmetoner/language-detection
add language detection endpoint
besimali authored Sep 25, 2022
2 parents ae5ebe6 + be2cd8a commit 1a7c532
Showing 3 changed files with 62 additions and 4 deletions.
24 changes: 23 additions & 1 deletion README.md
@@ -29,7 +29,28 @@ poetry run whisper_asr

After running the Docker image or ``poetry run whisper_asr``, interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs)

Simply upload your sound file and choose either **translate** or **transcribe**. Optionally, you can provide the language of the input file; otherwise it will be detected automatically.
There are two endpoints available:
- /asr
- /detect-language

## Automatic speech recognition service /asr

If you choose the **transcribe** task, the service transcribes the uploaded sound file.
You can provide the language, or it will be recognized automatically.
If you choose the **translate** task, it produces an English transcript regardless of the language spoken.

Returns a JSON object with the following fields (see the example request below):
- **text**: the full transcript
- **segments**: one entry per segment, each with timestamps, the transcript, token IDs, and other metadata
- **language**: the detected or provided language (as a language code)
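
For example, a transcription request with Python's `requests` library might look like the following sketch. The file name is a placeholder, the service is assumed to be running on localhost:9000, and the `task`/`language` query parameter names are assumptions based on the description above; the Swagger docs list the exact names.

```python
import requests

# Hypothetical call to the /asr endpoint; "task" and "language" parameter
# names are assumptions, while "audio_file" matches the endpoint's upload field.
with open("speech.wav", "rb") as f:  # placeholder file name
    response = requests.post(
        "http://localhost:9000/asr",
        params={"task": "transcribe", "language": "en"},  # "language" is optional
        files={"audio_file": f},
    )

print(response.json()["text"])  # the full transcript
```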

## Language detection service /detect-language

Detects the language spoken in the uploaded sound file. For longer files, only the first 30 seconds are processed.

Returns a JSON object with the following fields (see the example request below):
- **detected_language**: the full name of the detected language (e.g. `english`)
- **language_code**: the corresponding language code (e.g. `en`)
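
A minimal sketch of calling this endpoint with Python's `requests` (the file name is a placeholder; the service is assumed to be running on localhost:9000):

```python
import requests

# Hypothetical call to the /detect-language endpoint; "audio_file"
# matches the endpoint's upload field.
with open("speech.wav", "rb") as f:  # placeholder file name
    response = requests.post(
        "http://localhost:9000/detect-language",
        files={"audio_file": f},
    )

print(response.json())  # e.g. {"detected_language": "english", "language_code": "en"}
```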



@@ -69,3 +90,4 @@ docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice
* GitHub pipeline
* Unit tests
* CUDA version of Docker image
* Hosted Swagger documentation with descriptions
16 changes: 16 additions & 0 deletions src/whisper_asr/languages.py
@@ -99,4 +99,20 @@
"ba": "bashkir",
"jw": "javanese",
"su": "sundanese",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}
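
As a quick sketch of how this table behaves (assuming both dicts are importable from `whisper_asr.languages`), canonical names and aliases resolve to the same code:

```python
from whisper_asr.languages import LANGUAGES, TO_LANGUAGE_CODE

# An alias maps to the same code as the canonical name...
assert TO_LANGUAGE_CODE["castilian"] == TO_LANGUAGE_CODE["spanish"] == "es"
# ...while LANGUAGES maps the code back to the canonical name only.
assert LANGUAGES[TO_LANGUAGE_CODE["flemish"]] == "dutch"
```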
26 changes: 23 additions & 3 deletions src/whisper_asr/webservice.py
@@ -4,7 +4,7 @@
import os
import ffmpeg
from typing import BinaryIO, Union
from .languages import LANGUAGES
from .languages import LANGUAGES, TO_LANGUAGE_CODE
import numpy as np

SAMPLE_RATE = 16000
@@ -15,8 +15,6 @@
model_name = os.getenv("ASR_MODEL", "base")
model = whisper.load_model(model_name)



@app.post("/asr")
def transcribe_file(
    audio_file: UploadFile = File(...),
@@ -34,6 +32,28 @@ def transcribe_file(

    return result

@app.post("/detect-language")
def language_detection(
audio_file: UploadFile = File(...),
):


# load audio and pad/trim it to fit 30 seconds
audio = load_audio(audio_file.file)
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
detected_lang_code = max(probs, key=probs.get)

result = { "detected_language": LANGUAGES[detected_lang_code],
"langauge_code" : detected_lang_code }

return result

def load_audio(file: BinaryIO, sr: int = SAMPLE_RATE):
"""
Open an audio file object and read as mono waveform, resampling as necessary.
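The body of `load_audio` is collapsed in this diff. For reference, a minimal sketch of such a loader, based only on the `ffmpeg` and `numpy` imports above (not necessarily the exact code in this file), decodes any input to 16 kHz mono float32:

```python
def load_audio_sketch(file: BinaryIO, sr: int = SAMPLE_RATE):
    # Decode whatever container/codec ffmpeg understands into raw
    # 16-bit little-endian mono PCM at the requested sample rate.
    out, _ = (
        ffmpeg.input("pipe:", threads=0)
        .output("pipe:", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        .run(capture_stdout=True, capture_stderr=True, input=file.read())
    )
    # Scale int16 samples to float32 in [-1.0, 1.0], as Whisper expects.
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
```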
