Hi,
Thanks for taking a look at this. I got it fixed, and I'll share the solution for anyone else trying to do this. I'm using Python Flask on the backend and JavaScript on the frontend. The goal is to play the audio of speech synthesized from a given text and voice. I had been focusing on WAV output after conversations with Copilot, but the working version below has the backend return MP3 audio. I found this approach on StackOverflow: https://stackoverflow.com/questions/77771568/cant-stream-azure-tts-from-server-to-client-side-using-a-pushstream-response-o
Becky
Frontend:
// Fetch TTS from Backend
/**
 * Requests synthesized speech from the backend `/synthesize` endpoint,
 * starts playing it, and returns the audio as a Blob.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} voice - Voice name forwarded to the backend, e.g. "it-IT-ElsaNeural".
 * @returns {Promise<Blob|null>} The fetched audio Blob, or `null` when the
 *   request fails or the response body is empty.
 */
const synthesizeSpeech = async (text, voice) => {
  console.log(text, voice);
  // URLSearchParams encodes characters such as "&", "?", "#" and spaces that
  // would otherwise corrupt the query string.
  const query = new URLSearchParams({ text, voice });
  const urlWithParams = `/synthesize?${query}`;
  try {
    const response = await fetch(urlWithParams);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const audioBuffer = await response.arrayBuffer();
    if (audioBuffer.byteLength === 0) {
      console.error("Empty audio response");
      return null;
    }
    const audioBlob = new Blob([audioBuffer], { type: "audio/mpeg" });
    const audioUrl = URL.createObjectURL(audioBlob);
    const audio = new Audio(audioUrl);
    // The object URL keeps the blob alive until revoked; release it once the
    // player no longer needs it.
    const releaseUrl = () => URL.revokeObjectURL(audioUrl);
    audio.addEventListener("ended", releaseUrl, { once: true });
    audio.addEventListener("error", releaseUrl, { once: true });
    // Play the fetched audio. play() returns a promise that rejects when the
    // browser blocks playback; handle it here so it is not an unhandled
    // rejection, while still returning the blob to the caller.
    audio.play().catch((playError) => {
      console.error("Error playing text-to-speech audio:", playError);
      releaseUrl();
    });
    console.log("Fetched audio response:", audioBlob); // Print the audio response
    return audioBlob;
  } catch (error) {
    console.error("Error fetching text-to-speech audio:", error);
    return null;
  }
};
Backend:
def generate_speech_from_text(text, voice):
    """Synthesize ``text`` with the Azure Speech SDK and return MP3 audio in memory.

    Args:
        text: Text to synthesize.
        voice: Voice name to synthesize with, e.g. "en-US-JennyNeural".

    Returns:
        A ``BytesIO`` wrapping the synthesized audio bytes.

    Raises:
        RuntimeError: If the synthesis does not complete.
    """
    speech_config = speechsdk.SpeechConfig(
        subscription=speechKey,
        region=region,
    )
    speech_config.speech_synthesis_voice_name = voice  # e.g. "en-US-JennyNeural"
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
    )
    # audio_config=None keeps the result in memory only. Omitting the argument
    # makes the SDK also play the audio on the host's default output device,
    # which is unwanted (and may be unavailable) on a server.
    synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config,
        audio_config=None,
    )
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        return BytesIO(result.audio_data)

    # Error details are read from the cancellation details, as in the SDK samples.
    details = None
    if result.reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details.error_details
    raise RuntimeError(f"Speech synthesis failed: {details or result.reason}")
@app.route("/synthesize", methods=["GET"])
def synthesize():
    """Return synthesized speech for the ``text`` and ``voice`` query parameters.

    Responds with ``audio/mpeg`` bytes on success, 400 when ``text`` is missing
    or empty, and 500 when synthesis raises.
    """
    app.logger.debug("in textToSpeech")
    text_works = request.args.get("text")
    voice = request.args.get("voice")
    if not text_works:
        # A missing parameter is a client error on an existing route, so 400
        # rather than 404.
        return "Text not provided", 400
    try:
        audio_stream = generate_speech_from_text(text_works, voice)
        return Response(audio_stream.getvalue(), mimetype="audio/mpeg")
    except Exception:
        # logger.exception records the traceback, which print(err) dropped.
        app.logger.exception("Error in text-to-speech synthesis")
        return "Error in text-to-speech synthesis", 500