Error when returning audio stream from server using speech synthesis

Rebecca Peltz 50 Reputation points
2025-03-06T22:14:07.5333333+00:00

I was able to generate and produce audio speech on my local server. The API was generating a wav file and storing it in my local file system. The speech was played by an HTML audio tag picking up the local file.

When I deployed to the Azure web service, the app stopped speaking. I've done some debugging with Copilot to create a WAV audio stream. However, the format has a problem: I've been looking at the initial bytes of the WAV data, and they indicate this is not a WAV file.
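
For reference, a valid WAV (RIFF) file begins with the ASCII bytes "RIFF", a 4-byte size, and then "WAVE". A minimal check (a sketch; audio_bytes stands in for the synthesized data):

def looks_like_wav(audio_bytes: bytes) -> bool:
    # Canonical WAV layout: b"RIFF" + <4-byte chunk size> + b"WAVE".
    return len(audio_bytes) >= 12 and audio_bytes[:4] == b"RIFF" and audio_bytes[8:12] == b"WAVE"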

The server code I'm using is below, after the browser code.

Again, I need a way to produce audio and send it back to the browser. The code I'm using in the browser to play the audio is below; it produces a "not supported" error.

function playAudioFromResponse(response) {
    console.log('in play audio');
    const blob = new Blob([response], { type: 'audio/wav' });
    const url = URL.createObjectURL(blob);
    console.log(url);
    const audio = new Audio(url);
    console.log(audio);
    audio.play().catch(error => console.error('Error playing audio:', error));
}

import io
import azure.cognitiveservices.speech as speechsdk

class InMemoryStream(speechsdk.audio.PushAudioOutputStreamCallback):
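    # The Speech SDK calls write() as audio chunks are produced and close()
    # when the stream is closed; this callback buffers everything in memory.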
    def __init__(self):
        super().__init__()
        self._audio_data = io.BytesIO()

    def write(self, buffer: memoryview) -> int:
        self._audio_data.write(buffer)
        return len(buffer)

    def close(self):
        self._audio_data.seek(0)
        return self._audio_data

def generate_speech(text, language, voice_name):
    try:
        speech_config = speechsdk.SpeechConfig(subscription=speechKey, region=region)
        speech_config.speech_synthesis_language = language
        speech_config.speech_synthesis_voice_name = voice_name

        # Specify the output format as RIFF PCM (WAV) (suggestion from Copilot)
        speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm)

        # Create an in-memory stream for audio output
        audio_output_callback = InMemoryStream()
        audio_output_stream = speechsdk.audio.PushAudioOutputStream(audio_output_callback)
        audio_config = speechsdk.audio.AudioOutputConfig(stream=audio_output_stream)
        
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        
        result = synthesizer.speak_text_async(text).get()
        
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized successfully.")
            audio_data = audio_output_callback.close()
            print("after audio output close")
            audio_data.seek(0)
            # Log the first few bytes to check the format
            print("Audio data (first 32 bytes):", audio_data.read(32))
            audio_data.seek(0)
            return audio_data
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            print("Error details: {}".format(cancellation_details.error_details))
            raise Exception("Speech synthesis canceled: {}".format(cancellation_details.error_details))
        else:
            print("Error synthesizing speech: {}".format(result.reason))
            raise Exception("Speech synthesis failed with reason: {}".format(result.reason))
    except Exception as e:
        print(f"Error in generate_speech: {e}")
        raise
Azure AI Speech

4 answers

  1. Rebecca Peltz 50 Reputation points
    2025-03-07T16:29:35.7466667+00:00

    Hi,

    Thanks for taking a look at this. I got it fixed, and I'll share the solution for anyone looking to do this. I'm using Python Flask on the backend and JavaScript on the frontend; the goal is to play the audio of speech synthesized from text and a voice. I had been focusing on WAV after conversations with Copilot, but I found this approach on Stack Overflow: https://stackoverflow.com/questions/77771568/cant-stream-azure-tts-from-server-to-client-side-using-a-pushstream-response-o

    Becky

    Frontend:

    // Fetch TTS from Backend
    const synthesizeSpeech = async (text, voice) => {
      console.log(text, voice);
      // Example parameters: text = "Ciao", voice = "it-IT-ElsaNeural"
      const urlWithParams = `/synthesize?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}`;
      try {
        const response = await fetch(urlWithParams);
    
        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }
    
        const audioBuffer = await response.arrayBuffer();
    
        if (audioBuffer.byteLength > 0) {
          const audioBlob = new Blob([audioBuffer], { type: "audio/mpeg" });
          const audioUrl = URL.createObjectURL(audioBlob);
          const audio = new Audio(audioUrl);
    
          // Play the fetched audio
          audio.play();
    
          console.log("Fetched audio response:", audioBlob); // Print the audio response
    
          return audioBlob;
        } else {
          console.error("Empty audio response");
          return null;
        }
      } catch (error) {
        console.error("Error fetching text-to-speech audio:", error);
        return null;
      }
    };
    
    

    Backend:

    
    
    from io import BytesIO

    from flask import Flask, Response, request
    import azure.cognitiveservices.speech as speechsdk

    app = Flask(__name__)  # Flask app object (defined elsewhere in the real project)

    def generate_speech_from_text(text, voice):
        speech_config = speechsdk.SpeechConfig(
            subscription=speechKey,
            region=region
        )
        speech_config.speech_synthesis_voice_name = voice # "en-US-JennyNeural"
        speech_config.set_speech_synthesis_output_format(
            speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
        )
        
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
        result = synthesizer.speak_text_async(text).get()
        
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            return BytesIO(result.audio_data)
        else:
            cancellation = result.cancellation_details
            raise Exception(f"Speech synthesis failed: {cancellation.error_details if cancellation else result.reason}")
    
    @app.route("/synthesize", methods=["GET"])
    def synthesize():
        print("in textToSpeech")
        text_works = request.args.get('text')
        voice = request.args.get('voice')
        if text_works:
            try:
                audio_stream = generate_speech_from_text(text_works,voice)
                return Response(audio_stream.getvalue(), mimetype="audio/mpeg")
            except Exception as err:
                print(err)
                return "Error in text-to-speech synthesis", 500
        else:
            return "Text not provided", 404
    
    
    

  2. Alex Burlachenko 1,745 Reputation points
    2025-03-07T10:05:17.9366667+00:00

    Hi Rebecca,

    It sounds like the issue lies in the format of the audio stream being returned from your Azure web service. The error indicates that the browser is not recognizing the stream as a valid WAV file. Here are a few steps to troubleshoot and resolve this:

    1. Verify the WAV Header:
      • Ensure the audio stream includes a valid WAV header. The first 44 bytes of a WAV file contain the header, which specifies the format. If this is missing or incorrect, the browser won't recognize it as a valid WAV file.
      • You can manually add a WAV header to the stream if necessary (see the sketch after this list).
    2. Check the Output Format:
      • Confirm that the speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm format is being correctly applied. This should produce a valid WAV file.
      • Log the first few bytes of the audio stream to verify the format:
        print("Audio data (first 32 bytes):", audio_data.read(32))
    3. Return the Correct MIME Type:
      • Ensure your server is returning the correct MIME type (audio/wav) in the response headers.
    4. Browser-Side Debugging:
      • Use the browser's developer tools to inspect the network response and verify the content type and size of the audio stream.
      • Check for errors in the console when attempting to play the audio.
    5. Alternative Approach:
      • Instead of using an in-memory stream, consider saving the audio to a temporary file on the server and returning the file URL to the browser. This can help isolate whether the issue is with the stream or the browser's handling of it.
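
    As a sketch of point 1, a minimal 44-byte RIFF header can be built by hand. This assumes 24 kHz, 16-bit, mono PCM to match Riff24Khz16BitMonoPcm; the field layout comes from the standard WAV format, not from the Speech SDK:

    import struct

    def wav_header(data_len, sample_rate=24000, bits=16, channels=1):
        # Minimal 44-byte RIFF/WAVE header for uncompressed PCM data.
        byte_rate = sample_rate * channels * bits // 8
        block_align = channels * bits // 8
        return (b"RIFF" + struct.pack("<I", 36 + data_len) + b"WAVE"
                + b"fmt " + struct.pack("<IHHIIHH", 16, 1, channels,
                                        sample_rate, byte_rate, block_align, bits)
                + b"data" + struct.pack("<I", data_len))

    # Usage sketch: prepend the header to raw PCM bytes if the stream arrives headerless.
    # wav_bytes = wav_header(len(pcm_bytes)) + pcm_bytes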

    Try this modified Python code to ensure the WAV header is included:

    import io
    import struct  # only needed if you build a WAV header by hand (see the sketch above)
    import azure.cognitiveservices.speech as speechsdk
    
    class InMemoryStream(speechsdk.audio.PushAudioOutputStreamCallback):  
        def __init__(self):  
            super().__init__()  
            self._audio_data = io.BytesIO()  
    
        def write(self, buffer: memoryview) -> int:  
            self._audio_data.write(buffer)  
            return len(buffer)  
    
        def close(self):  
            self._audio_data.seek(0)  
            return self._audio_data  
    
    def generate_speech(text, language, voice_name):  
        try:  
            speech_config = speechsdk.SpeechConfig(subscription=speechKey, region=region)  
            speech_config.speech_synthesis_language = language  
            speech_config.speech_synthesis_voice_name = voice_name  
            speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm)  
    
            audio_output_callback = InMemoryStream()  
            audio_output_stream = speechsdk.audio.PushAudioOutputStream(audio_output_callback)  
            audio_config = speechsdk.audio.AudioOutputConfig(stream=audio_output_stream)  
    
            synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)  
            result = synthesizer.speak_text_async(text).get()  
    
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:  
                print("Speech synthesized successfully.")  
                audio_data = audio_output_callback.close()  
                audio_data.seek(0)  
                return audio_data  
            else:
                raise Exception("Speech synthesis failed: {}".format(result.reason))
        except Exception as e:  
            print(f"Error in generate_speech: {e}")  
            raise  
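
    On the Flask side, a sketch of returning that stream with the audio/wav MIME type from point 3 (the /speak route and the Flask wiring here are illustrative, not from the original code):

    from flask import Flask, Response, request

    app = Flask(__name__)

    @app.route("/speak")
    def speak():
        text = request.args.get("text", "")
        # generate_speech is the function above; language/voice are hard-coded for the sketch.
        audio_data = generate_speech(text, "en-US", "en-US-JennyNeural")
        return Response(audio_data.getvalue(), mimetype="audio/wav")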
    

    For more details, refer to the official documentation:

    Azure AI Speech SDK

    Speech Synthesis Output Formats

    Let me know if you need further assistance!

    rgds,

    Alex.

    p.s. Kindly consider accepting the answer if the information provided is helpful. This can assist other community members in resolving similar issues.


  3. Rebecca Peltz 50 Reputation points
    2025-03-08T07:24:35.3233333+00:00

    I've reworked my speech synthesizer code following examples from the Microsoft docs. In this version, I encode the audio data with base64 and return it with an audio/wav MIME type. Again, this works perfectly on my local machine. I have other Speech APIs that work both locally and on the server, but I continue to have problems with the speech synthesizer on the server.

    I'd appreciate any help figuring out why this won't run on the Azure server. It works well locally. I'm also running some Speech Recognition and Translation APIs successfully on Azure in the same application.

    Here is my backend code:

    import azure.cognitiveservices.speech as speechsdk
    from flask import Flask, jsonify, request

    app = Flask(__name__)  # Flask app object (defined elsewhere in the real project)

    def generate_speech_from_text(text, voice):
        speech_config = speechsdk.SpeechConfig(
            subscription=speechKey,
            region=region
        )
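        # NOTE: use_default_speaker=True plays through the host's default audio
        # device; a headless Azure web server typically has none, which is a
        # likely reason this version fails after deployment.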
        audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)
        speech_config.speech_synthesis_voice_name = "en-US-AvaMultilingualNeural" #voice
     
        
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,audio_config=audio_config)
        speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
        # print("speech_synthesis_result:",speech_synthesis_result)
        if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            # return BytesIO(result.audio_data)
            # print("audio data:",speech_synthesis_result.audio_data)
            return speech_synthesis_result.audio_data
    
        elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speech_synthesis_result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
            return None
        else:
            print("Unexpected speech synthesis result: {}".format(speech_synthesis_result.reason))
            return None
        # else:
        #     raise Exception(f"Speech synthesis failed: {result.error_details}")
    
    @app.route("/synthesize", methods=["GET"])
    def synthesize():
        # print("in textToSpeech")
        text_to_speak = request.args.get('text')
        voice = request.args.get('voice')
        print(text_to_speak,voice)
        if not text_to_speak:
            return jsonify({"error": "Missing 'text' parameter"}), 400
        # voice = request.args.get('voice')
        audio_data = generate_speech_from_text(text_to_speak,voice)
        # print('audio_data: ',audio_data)
        if audio_data:
            # Convert to base64 for embedding in JSON (or other methods).
            import base64
            audio_base64 = base64.b64encode(audio_data).decode('utf-8')
    
            return jsonify({"audioData": audio_base64, "contentType":"audio/wav"}) #Return the base64 encoded data, and the content type.
        else:
            return jsonify({"error": "Speech synthesis failed"}), 500
    

    Here is my front end code:

    async function synthesizeSpeech (text, voice) {
      const urlWithParams = `/synthesize?text=${encodeURIComponent(text)}&voice=${encodeURIComponent(voice)}`;
      console.log(urlWithParams)
      try {
            const response = await fetch(urlWithParams);
            if (!response.ok) {
              throw new Error(`HTTP error! status: ${response.status}`);
            }
            const data = await response.json();
            const audio = new Audio(`data:${data.contentType};base64,${data.audioData}`);
            audio.play();
      } catch (error) {
        console.error("Error fetching text-to-speech audio:", error);
        return null;
      }
    }
    
    

  4. Saideep Anchuri 3,850 Reputation points Microsoft External Staff
    2025-03-09T10:07:35.07+00:00

    Hi Rebecca Peltz

    I'm glad that you were able to resolve your issue, and thank you for posting your solution so that others experiencing the same thing can easily reference it! Since the Microsoft Q&A community has a policy that "the question author cannot accept their own answer; they can only accept answers by others," I'll repost your solution in case you'd like to accept the answer.

    Ask: Error when returning audio stream from server using speech synthesis

    Solution: The issue is resolved. You got it fixed and shared the approach for anyone looking to do this. You are using Python Flask on the backend and JavaScript on the frontend; the goal is to play the audio of speech synthesized from text and a voice. You had been focusing on WAV after conversations with Copilot, and found this approach on Stack Overflow: https://stackoverflow.com/questions/77771568/cant-stream-azure-tts-from-server-to-client-side-using-a-pushstream-response-o

    Frontend and backend code: see Rebecca's answer above; the solution is reposted from there.

    If I missed anything please let me know and I'd be happy to add it to my answer, or feel free to comment below with any additional information.

    If you have any other questions, please let me know. Thank you again for your time and patience throughout this issue.

     

    Please don’t forget to Accept Answer and Yes for "was this answer helpful" wherever the information provided helps you, this can be beneficial to other community members.

    Thank You.

