Hi All,
I use the TranslationRecognizer from the Speech SDK for transcription and translation, and I use the synthesizing event to output the translated audio. At runtime I also switch the target language. This is how the recognizer is created:
import os
import azure.cognitiveservices.speech as speechsdk

def create_translation_recognizer(self):
    speech_translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=os.getenv('AZURE_SPEECH_KEY'),
        region=os.getenv('AZURE_SPEECH_REGION'))
    speech_translation_config.set_property(speechsdk.PropertyId.Speech_SegmentationStrategy, "Semantic")
    speech_translation_config.speech_recognition_language = self.source_language
    speech_translation_config.add_target_language(self.target_language)
    # See voices in Azure Speech Studio: https://speech.microsoft.com/portal/voicegallery
    speech_translation_config.voice_name = "en-US-AdamMultilingualNeural"
    speech_translation_config.request_word_level_timestamps()
    speech_translation_config.output_format = speechsdk.OutputFormat.Detailed

    self.push_stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=self.push_stream)
    self.translation_recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=speech_translation_config, audio_config=audio_config)

    self.translation_recognizer.recognizing.connect(self._recognizing)
    self.translation_recognizer.recognized.connect(self._recognized)
    self.translation_recognizer.canceled.connect(self._canceled)
    self.translation_recognizer.session_stopped.connect(self._session_stopped)
    self.translation_recognizer.synthesizing.connect(self._synthesis_callback)
    self.translation_recognizer.start_continuous_recognition_async()
I receive the transcribed and translated text as well as the synthesized audio without issues.
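For completeness, audio reaches the recognizer through the push stream. A minimal sketch of the feeding side, assuming raw PCM in the stream's default format (feed_audio and stop_feeding are illustrative names, not SDK methods):

def feed_audio(self, chunk: bytes):
    # raw PCM matching the stream's default format: 16 kHz, 16-bit, mono
    self.push_stream.write(chunk)

def stop_feeding(self):
    # closing the stream signals end-of-audio to the recognizer
    self.push_stream.close()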
After every transcription I check if the target language has changed:
def _synthesis_callback(self, evt):
    print("synth")
    # print("synthoffset", evt.result.offset)
    if evt.result.reason == speechsdk.ResultReason.SynthesizingAudio:
        size = len(evt.result.audio)
        print(f'Audio synthesized: {size} byte(s) {"(COMPLETED)" if size == 0 else ""}')
        if size > 0:
            self.audio_out(evt.result.audio)

def _recognizing(self, evt):
    if evt.result.reason == speechsdk.ResultReason.TranslatingSpeech:
        # print(f'Recognizing: "{evt.result.translations[self.target_language]}"')
        self.activity_out(evt.result.translations[self.target_language])  # for future use

def _recognized(self, evt):
    print("offset", evt.result.offset / 10**7)
    if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
        if evt.result.text == "":
            return
        print("\nIncoming Translation:\n", evt.result.translations[self.target_language])
        self.queue.append({"text": evt.result.text,
                           "translation": evt.result.translations[self.target_language]})
    elif evt.result.reason == speechsdk.ResultReason.NoMatch:
        print('No speech could be translated.')
    self.check_language()

def check_language(self):
    # the target code is read from the start of the file's second line
    with open("language_config.txt", "r") as f:
        target_lang = f.read().strip().splitlines()[1][:2]
    print("targetlang", target_lang)
    if target_lang != self.target_language:
        print("trying to change")
        self.translation_recognizer.add_target_language(target_lang)
        self.translation_recognizer.remove_target_language(self.target_language)
        self.target_language = target_lang  # remember the new target for later lookups
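For context, check_language only reads the first two characters of the second line, so language_config.txt looks something like this (e.g. source language on the first line; only the second line matters to this function):

de
en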
When the language has changed, I add the new language and remove the previous one. This also works fine: the transcribed and translated text continues to be output. However, as soon as I do this, the synthesizing callback is no longer triggered. The only workaround I have found so far is to completely re-create the TranslationRecognizer (sketched below), but that discards everything currently loaded in it. Do I need to re-initialize the synthesizer somehow?
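A minimal sketch of that re-create workaround, assuming any audio buffered in the old recognizer can be dropped (recreate_recognizer is an illustrative name):

def recreate_recognizer(self, new_target):
    # stop the old recognizer and end its audio stream
    self.translation_recognizer.stop_continuous_recognition_async().get()
    self.push_stream.close()
    # remember the new target, then rebuild stream, recognizer and callbacks
    self.target_language = new_target
    self.create_translation_recognizer()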