v1.0.6 adding custom voice support, in response to #13

jeroenterheerdt · Dec 8, 2020 · 5befc89 · 5befc89
1 parent 6267feb
commit 5befc89
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -16,5 +16,12 @@ with open("file2.wav", "wb") as f:
         f.write(data)
 ```
 
+You can also use a custom voice by specifying `isCustom=True` and providing a `customEndpoint`:
+```python
+from pycsspeechtts import TTSTranslator
+t = TTSTranslator("YOUR API KEY","westeurope", isCustom=True, customEndpoint=MyEndpoint)
+data = t.speak(language='en-gb',gender='Male',voiceType="ArchieNeural",text="This is a test for custom voice")
+```
+
 See test.py for more samples.
 Refer to https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support to find the valid values for language, gender, voicetype and output formats.
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 setup(name='pycsspeechtts',
-      version='1.0.5',
+      version='1.0.6',
       description='Python 3 interface to Microsoft Cognitive Services Text To Speech',
       long_description=long_description,
       long_description_content_type="text/markdown",

diff --git a/src/pycsspeechtts/pycsspeechtts.py b/src/pycsspeechtts/pycsspeechtts.py
@@ -17,16 +17,20 @@ class TTSTranslator(object):
     Interface class for the Microsoft Cognitive Services Text-to-speech translator
     """
 
-    def __init__(self, apiKey, region="eastus"):
+    def __init__(self, apiKey, region="eastus", isCustom=False, customEndpoint=None):
         self._apiKey = apiKey
         self._geoLocation = region
+        self._isCustom = isCustom
+        self._customEndpoint=customEndpoint
 
         headers = {"Ocp-Apim-Subscription-Key": self._apiKey}
-        response = requests.post(AccessTokenUrlTemplate.format(
-            self._geoLocation), headers=headers)
-        response.raise_for_status()
+        if not self._isCustom:
+            response = requests.post(AccessTokenUrlTemplate.format(
+                self._geoLocation), headers=headers)
+            response.raise_for_status()
+            self._accesstoken = str(response.text)
         _LOGGER.debug("Connection Initialized OK")
-        self._accesstoken = str(response.text)
+
 
     def speak(self, language="en-us", gender="Female", voiceType="JessaNeural",
               output="riff-24khz-16bit-mono-pcm", rate="+0.00%", volume="+0.00%",
@@ -44,23 +48,35 @@ def name_lang(language):
         voice.set(
             'name', 'Microsoft Server Speech Text to Speech Voice ('+name_lang(language)+', '+voiceType+')')
 
-        prosody = ElementTree.SubElement(voice, 'prosody')
-        prosody.set('rate', rate)
-        prosody.set('volume', volume)
-        prosody.set('pitch', pitch)
-        prosody.set('contour', contour)
-        prosody.text = text
-
-        headers = {"Content-Type": "application/ssml+xml",
-                   "X-Microsoft-OutputFormat": output,
-                   "Authorization": "Bearer " + self._accesstoken,
-                   "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
-                   "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
-                   "User-Agent": "PYCSSpeechTTS"
-                   }
+        endpoint = None
+        if self._isCustom:
+            # this is a custom voice
+            endpoint = self._customEndpoint
+            headers = {"Content-Type": "application/ssml+xml",
+                "X-Microsoft-OutputFormat": output,
+                "Ocp-Apim-Subscription-Key": self._apiKey,
+                "User-Agent": "PYCSSpeechTTS"
+                }
+            voice.text = text
+        else: 
+            # not a custom voice, generate the endpoint
+            endpoint = SpeechUrlTemplate.format(self._geoLocation)
+            headers = {"Content-Type": "application/ssml+xml",
+                "X-Microsoft-OutputFormat": output,
+                "Authorization": "Bearer " + self._accesstoken,
+                "X-Search-AppId": "07D3234E49CE426DAA29772419F436CA",
+                "X-Search-ClientID": "1ECFAE91408841A480F00935DC390960",
+                "User-Agent": "PYCSSpeechTTS"
+                }
+            prosody = ElementTree.SubElement(voice, 'prosody')
+            prosody.set('rate', rate)
+            prosody.set('volume', volume)
+            prosody.set('pitch', pitch)
+            prosody.set('contour', contour)
+            prosody.text = text
 
         response = requests.post(
-            SpeechUrlTemplate.format(self._geoLocation), ElementTree.tostring(body), headers=headers)
+            endpoint, ElementTree.tostring(body), headers=headers)
         if response.status_code == requests.codes.ok:
             _LOGGER.debug("Text synthesis OK")
             return response.content

diff --git a/src/pycsspeechtts/test.py b/src/pycsspeechtts/test.py
@@ -1,26 +1,33 @@
 from pycsspeechtts import TTSTranslator
-t = TTSTranslator("YOUR_API_KEY")
-
-# Speaking with default language of english US and default Female voice
-data = t.speak(text='This is a test')
-# Change speed with -50%
-data = t.speak(text="This is a test", rate="-50%")
-# Change pitch to high
-data = t.speak(text="This is a test", pitch="high")
-# Change volume to +20%
-data = t.speak(text="This is a test", volume="+20%")
-# See https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support for the accepted values of the parameters below
-data = t.speak('en-GB', 'Male', 'George, Apollo',
-               'riff-16khz-16bit-mono-pcm', text='I am Max')
-data = t.speak('cs-CZ', 'Male', 'Jakub', text='Pojďme vyzkoušet klasickou českou testovací větu. Příliš žluťoučký kůň úpěl ďábelské ódy.')
-# Using contour to change pitch from normal at 0% of speech and +100% at 100% of speech
-data = t.speak(text="The Wall Street Journal - which says it's spoken to people close to the ongoing investigation - says the information it has paints a picture of a catastrophic failure that quickly overwhelmed the flight crew",
-               contour="(0%,+0%) (100%,+100%)")
+useCustom = True
+api_key = "YOUR_API_KEY"
+custom_endpoint = "custom_endpoint"
 
+if not useCustom:
+    t = TTSTranslator(api_key, region="westus")
+    # Speaking with default language of english US and default Female voice
+    data = t.speak(text='This is a test')
+    # Change speed with -50%
+    data = t.speak(text="This is a test", rate="-50%")
+    # Change pitch to high
+    data = t.speak(text="This is a test", pitch="high")
+    # Change volume to +20%
+    data = t.speak(text="This is a test", volume="+20%")
+    # See https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support for the accepted values of the parameters below
+    data = t.speak('en-GB', 'Male', 'George, Apollo',
+                'riff-16khz-16bit-mono-pcm', text='I am Max')
+    data = t.speak('cs-CZ', 'Male', 'Jakub', text='Pojďme vyzkoušet klasickou českou testovací větu. Příliš žluťoučký kůň úpěl ďábelské ódy.')
+    # Using contour to change pitch from normal at 0% of speech and +100% at 100% of speech
+    data = t.speak(text="The Wall Street Journal - which says it's spoken to people close to the ongoing investigation - says the information it has paints a picture of a catastrophic failure that quickly overwhelmed the flight crew",
+                contour="(0%,+0%) (100%,+100%)")
+else:
+    # Test custom voice
+    t = TTSTranslator(api_key, region="westus", isCustom=True, customEndpoint=custom_endpoint)
+    data = t.speak(language='en-gb',gender='Male',voiceType="ArchieNeural",text="This is a test for custom voice")
 
 if data == None:
-    print("An error occured")
+    print("An error occurred")
 else:
     with open("file.wav", "wb") as f:
         f.write(data)
-    print("Succes! Open file.wav to hear the results")
+    print("Success! Open file.wav to hear the results")