From be2cd8af53a57970021f36604a1d961f3305b196 Mon Sep 17 00:00:00 2001
From: Besim Alibegovic
Date: Sun, 25 Sep 2022 15:53:50 +0200
Subject: [PATCH] add language detection endpoint

---
 README.md                     | 24 +++++++++++++++++++++++-
 src/whisper_asr/languages.py  | 16 ++++++++++++++++
 src/whisper_asr/webservice.py | 26 +++++++++++++++++++++++---
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 7f4b112..cfa9e01 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,28 @@ poetry run whisper_asr
 
 After running the docker image or ``poetry run whisper_asr`` interactive Swagger API documentation is available at [localhost:9000/docs](http://localhost:9000/docs)
 
-Simply upload your sound file and choose either **translate** or **transcribe**. Optionally you can provide the language of the input file, otherwise it will be automatically detected.
+There are two endpoints available:
+- /asr
+- /detect-language
+
+## Automatic speech recognition service /asr
+
+If you choose the **transcribe** task, it transcribes the uploaded sound file.
+You can provide the language, otherwise it will be automatically recognized.
+If you choose the **translate** task, it will provide an English transcript no matter which language was spoken.
+
+Returns a JSON with the following fields:
+- **text**: contains the full transcript
+- **segments**: contains an entry per segment. Each entry provides timestamps, the transcript, token IDs and other metadata
+- **language**: detected or provided language (as a language code)
+
+## Language detection service /detect-language
+
+Detects the language spoken in the uploaded sound file. For longer files, only the first 30 seconds are processed.
+
+Returns a JSON with the following fields:
+- **detected_language**
+- **language_code**
@@ -69,3 +90,4 @@ docker run -d -p 9000:9000 -e ASR_MODEL=base whisper-asr-webservice
 * Github pipeline
 * Unit tests
 * CUDA version of Docker image
+* Hosted Swagger documentation with descriptions
diff --git a/src/whisper_asr/languages.py b/src/whisper_asr/languages.py
index 96e1545..25639b0 100644
--- a/src/whisper_asr/languages.py
+++ b/src/whisper_asr/languages.py
@@ -99,4 +99,20 @@
     "ba": "bashkir",
     "jw": "javanese",
     "su": "sundanese",
+}
+
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
 }
\ No newline at end of file
diff --git a/src/whisper_asr/webservice.py b/src/whisper_asr/webservice.py
index 51f9882..7588d7e 100644
--- a/src/whisper_asr/webservice.py
+++ b/src/whisper_asr/webservice.py
@@ -4,7 +4,7 @@
 import os
 import ffmpeg
 from typing import BinaryIO, Union
-from .languages import LANGUAGES
+from .languages import LANGUAGES, TO_LANGUAGE_CODE
 import numpy as np
 
 SAMPLE_RATE=16000
@@ -15,8 +15,6 @@
 model_name= os.getenv("ASR_MODEL", "base")
 model = whisper.load_model(model_name)
 
-
-
 @app.post("/asr")
 def transcribe_file(
         audio_file: UploadFile = File(...),
@@ -34,6 +32,28 @@ def transcribe_file(
 
     return result
 
+@app.post("/detect-language")
+def language_detection(
+        audio_file: UploadFile = File(...),
+        ):
+
+
+    # load audio and pad/trim it to fit 30 seconds
+    audio = load_audio(audio_file.file)
+    audio = whisper.pad_or_trim(audio)
+
+    # make log-Mel spectrogram and move to the same device as the model
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    # detect the spoken language
+    _, probs = model.detect_language(mel)
+    detected_lang_code = max(probs, key=probs.get)
+
+    result = { "detected_language": LANGUAGES[detected_lang_code],
+               "language_code": detected_lang_code }
+
+    return result
+
 def load_audio(file: BinaryIO, sr: int = SAMPLE_RATE):
     """
     Open an audio file object and read as mono waveform, resampling as necessary.
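
Once the patch is applied, the new endpoint can be exercised with any HTTP client. The sketch below uses the Python requests library against a locally running instance on port 9000 (as described in the README); the file name speech.wav and the example output values are only illustrative, while the multipart field name audio_file matches the UploadFile parameter in webservice.py.

    import requests

    # Post a local audio file to the language detection endpoint.
    # The multipart field name must match the UploadFile parameter ("audio_file").
    with open("speech.wav", "rb") as f:
        response = requests.post(
            "http://localhost:9000/detect-language",
            files={"audio_file": f},
        )

    response.raise_for_status()
    result = response.json()
    # e.g. {"detected_language": "english", "language_code": "en"}
    print(result["detected_language"], result["language_code"])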
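
The TO_LANGUAGE_CODE table added to languages.py is imported by the webservice but not used in the code shown in this patch; it resolves full language names and a few aliases back to Whisper language codes. A small sketch of that lookup, assuming the package is importable as whisper_asr (the specific names queried are just examples drawn from the dictionary contents):

    from whisper_asr.languages import TO_LANGUAGE_CODE

    # Canonical names come from inverting LANGUAGES: "en" -> "english" becomes "english" -> "en".
    assert TO_LANGUAGE_CODE["english"] == "en"

    # The explicit aliases map onto the same codes as their canonical names.
    assert TO_LANGUAGE_CODE["castilian"] == TO_LANGUAGE_CODE["spanish"] == "es"
    assert TO_LANGUAGE_CODE["valencian"] == "ca"

    # A user-supplied language name could be normalized to a code before calling /asr.
    def to_language_code(name: str) -> str:
        return TO_LANGUAGE_CODE.get(name.lower().strip(), name)

    print(to_language_code("Flemish"))  # -> "nl"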