From be2cd8af53a57970021f36604a1d961f3305b196 Mon Sep 17 00:00:00 2001
From: Besim Alibegovic
Date: Sun, 25 Sep 2022 15:53:50 +0200
Subject: [PATCH] add language detection endpoint

---
 README.md                     | 24 +++++++++++++++++++++++-
 src/whisper_asr/languages.py  | 16 ++++++++++++++++
 src/whisper_asr/webservice.py | 26 +++++++++++++++++++++++---
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 7f4b112..cfa9e01 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,28 @@ poetry run whisper_asr
 
 After running the docker image or ``poetry run whisper_asr`` interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs)
 
-Simply upload your sound file and choose either **translate** or **transcribe**. Optionally you can provide the language of the input file, otherwise it will be automatically detected.
+There are two endpoints available:
+- /asr
+- /detect-language
+
+## Automatic speech recognition service /asr
+
+If you choose the **transcribe** task, it transcribes the uploaded sound file.
+You can provide the language, otherwise it will be automatically recognized.
+If you choose the **translate** task, it will provide an English transcript no matter which language was spoken.
+
+Returns a JSON with the following fields:
+- **text**: contains the full transcript
+- **segments**: contains an entry per segment. Each entry provides timestamps, the transcript, token IDs and other metadata
+- **language**: detected or provided language (as a language code)
+
+## Language detection service /detect-language
+
+Detects the language spoken in the uploaded sound file. For longer files, only the first 30 seconds are processed.
+
+Returns a JSON with the following fields:
+- **detected_language**
+- **language_code**
@@ -69,3 +90,4 @@ docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice
 * Github pipeline
 * Unit tests
 * CUDA version of Docker image
+* Hosted Swagger documentation with descriptions
diff --git a/src/whisper_asr/languages.py b/src/whisper_asr/languages.py
index 96e1545..25639b0 100644
--- a/src/whisper_asr/languages.py
+++ b/src/whisper_asr/languages.py
@@ -99,4 +99,20 @@
     "ba": "bashkir",
     "jw": "javanese",
     "su": "sundanese",
+}
+
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
 }
\ No newline at end of file
diff --git a/src/whisper_asr/webservice.py b/src/whisper_asr/webservice.py
index 51f9882..7588d7e 100644
--- a/src/whisper_asr/webservice.py
+++ b/src/whisper_asr/webservice.py
@@ -4,7 +4,7 @@
 import os
 import ffmpeg
 from typing import BinaryIO, Union
-from .languages import LANGUAGES
+from .languages import LANGUAGES, TO_LANGUAGE_CODE
 import numpy as np
 
 SAMPLE_RATE=16000
@@ -15,8 +15,6 @@
 model_name= os.getenv("ASR_MODEL", "base")
 model = whisper.load_model(model_name)
 
-
-
 @app.post("/asr")
 def transcribe_file(
         audio_file: UploadFile = File(...),
@@ -34,6 +32,28 @@ def transcribe_file(
 
     return result
 
+@app.post("/detect-language")
+def language_detection(
+        audio_file: UploadFile = File(...),
+        ):
+
+
+    # load audio and pad/trim it to fit 30 seconds
+    audio = load_audio(audio_file.file)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    detected_lang_code = max(probs, key=probs.get)
+
+    result = { "detected_language": LANGUAGES[detected_lang_code],
+               "language_code": detected_lang_code }
+
+    return result
+
 def load_audio(file: BinaryIO, sr: int = SAMPLE_RATE):
     """
     Open an audio file object and read as mono waveform, resampling as necessary.
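
Once the patch is applied, the new endpoint can be exercised with any HTTP client. The sketch below uses the Python requests library against a locally running instance on port 9000 (as described in the README); the file name speech.wav and the example output values are only illustrative, while the multipart field name audio_file matches the UploadFile parameter in webservice.py.

    import requests

    # Post a local audio file to the language detection endpoint.
    # The multipart field name must match the UploadFile parameter ("audio_file").
    with open("speech.wav", "rb") as f:
        response = requests.post(
            "http://localhost:9000/detect-language",
            files={"audio_file": f},
        )

    response.raise_for_status()
    result = response.json()
    # e.g. {"detected_language": "english", "language_code": "en"}
    print(result["detected_language"], result["language_code"])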
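
The TO_LANGUAGE_CODE table added to languages.py is imported by the webservice but not used in the code shown in this patch; it resolves full language names and a few aliases back to Whisper language codes. A small sketch of that lookup, assuming the package is importable as whisper_asr (the specific names queried are just examples drawn from the dictionary contents):

    from whisper_asr.languages import TO_LANGUAGE_CODE

    # Canonical names come from inverting LANGUAGES: "en" -> "english" becomes "english" -> "en".
    assert TO_LANGUAGE_CODE["english"] == "en"

    # The explicit aliases map onto the same codes as their canonical names.
    assert TO_LANGUAGE_CODE["castilian"] == TO_LANGUAGE_CODE["spanish"] == "es"
    assert TO_LANGUAGE_CODE["valencian"] == "ca"

    # A user-supplied language name could be normalized to a code before calling /asr.
    def to_language_code(name: str) -> str:
        return TO_LANGUAGE_CODE.get(name.lower().strip(), name)

    print(to_language_code("Flemish"))  # -> "nl"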