Note
Please see Azure Cognitive Services for Speech documentation for the latest supported speech solutions.
Microsoft Speech Platform
Persist Recognized Audio to WAV Files
Applications can persist or store the audio that was recognized by a speech recognition (SR) engine to a WAV file. This topic presents the following information about persisting recognized speech to WAV files:
- When to persist recognized audio
- How to configure an application to store recognized audio
- What APIs to use
- Sample source code
When to persist recognized audio
The following scenarios typically need to store the WAV audio recognized by the SR engine:
- Transcription applications (for example, convert voice mail to email)
- Audio correction user interface (for example, replay and/or re-recognize audio snippets)
- SR engine testing (for example, measure and improve engine accuracy with reproducible audio input data)
Configure an application to store audio from speech recognition
Follow these basic steps to retrieve and store recognized WAV audio:
- Create an SR engine.
- Enable retained audio on the relevant recognition context.
- Set the retained audio format (specify lower quality for smaller storage size, higher quality for clearer audio). The default is the SR engine's audio format.
- Set up and receive recognition events for the relevant recognition context.
- Retrieve the audio stream from the recognition result.
- Create a stream that is bound to a WAV file (for example, with SpBindToFile), and copy the recognition result's audio stream to that file-bound stream.
Back to top
APIs to persist or store recognized audio to WAV files
Applications typically use the following APIs to store audio that was recognized by the SR engine to a WAV file.
API | What it does |
---|---|
SpStream object, ISpStream interface | The basic audio stream in the Speech Platform |
ISpStream::BindToFile | Set up an audio stream for WAV file input |
SpBindToFile | Helper function to set up a stream with a WAV file |
ISpRecoContext::SetAudioOptions | Enable or disable audio retention |
ISpRecoResult::GetAudio | Retrieves recognized audio |
ISpStreamFormat::GetFormat | Retrieve the format of recognized audio |
CSpStreamFormat | Helper object for handling audio formats |
ISpStream::Read, ISpStream::Write | Methods for reading and writing stream data (inherited from IStream) |
SPEI_RECOGNITION, SPEI_FALSE_RECOGNITION | Events sent by the Speech Platform when a recognition or a false recognition has occurred |
Table 1. APIs for persisting recognized audio
Back to top
Example
The following example initializes a recognizer, subscribes to the recognition event, and sets the recognizer to receive input from the default audio input for the system, typically a microphone. The example specifies the format in which to store audio and instructs the recognizer to retain the audio that corresponds to its recognition results. With the recognizer configured, the example loads a grammar, activates its rules, and starts recognition. When a phrase that matches the grammar is recognized, the recognizer stores the audio associated with the recognition in the specified format. Finally, the example plays back the captured audio.
`
int _tmain(int argc, _TCHAR* argv[]) { CoInitialize(NULL); { HRESULT hr = S_OK;`Back to top// Find the best matching installed en-us recognizer. CComPtr<ISpObjectToken> cpRecognizerToken; if (SUCCEEDED(hr)) { hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken;); } // Create the in-process recognizer and immediately set its state to inactive. CComPtr<ISpRecognizer> cpRecognizer; if (SUCCEEDED(hr)) { hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer); } if (SUCCEEDED(hr)) { hr = cpRecognizer->SetRecognizer(cpRecognizerToken); } if (SUCCEEDED(hr)) { hr = cpRecognizer->SetRecoState(SPRST_INACTIVE); } // Create a new recognition context from the recognizer. CComPtr<ISpRecoContext> cpContext; if (SUCCEEDED(hr)) { hr = cpRecognizer->CreateRecoContext(&cpContext;); } // Subscribe to the speech recognition event and end stream event. if (SUCCEEDED(hr)) { ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION); hr = cpContext->SetInterest(ullEventInterest, ullEventInterest); } // Establish a Win32 event to signal when speech events are available. HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE; if (SUCCEEDED(hr)) { hr = cpContext->SetNotifyWin32Event(); } if (SUCCEEDED(hr)) { hSpeechNotifyEvent = cpContext->GetNotifyEventHandle(); if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent) { // Notification handle unsupported. hr = E_NOINTERFACE; } } // Initialize an audio object to use the default audio input of the system and set the recognizer to use it. CComPtr<ISpAudio> cpAudioIn; if (SUCCEEDED(hr)) { hr = cpAudioIn.CoCreateInstance(CLSID_SpMMAudioIn); } if (SUCCEEDED(hr)) { hr = cpRecognizer->SetInput(cpAudioIn, TRUE); } // Populate a WAVEFORMATEX struct with our desired output audio format. information. 
WAVEFORMATEX* pWfexCoMemRetainedAudioFormat = NULL; GUID guidRetainedAudioFormat = GUID_NULL; if (SUCCEEDED(hr)) { hr = SpConvertStreamFormatEnum(SPSF_16kHz16BitMono, &guidRetainedAudioFormat;, &pWfexCoMemRetainedAudioFormat;); } // Instruct the recognizer to retain the audio from its recognition results. if (SUCCEEDED(hr)) { hr = cpContext->SetAudioOptions(SPAO_RETAIN_AUDIO, &guidRetainedAudioFormat;, pWfexCoMemRetainedAudioFormat); } if (NULL != pWfexCoMemRetainedAudioFormat) { CoTaskMemFree(pWfexCoMemRetainedAudioFormat); } // Create a new grammar and load an SRGS grammar from file. CComPtr<ISpRecoGrammar> cpGrammar; if (SUCCEEDED(hr)) { hr = cpContext->CreateGrammar(0, &cpGrammar;); } if (SUCCEEDED(hr)) { hr = cpGrammar->LoadCmdFromFile(L"C:\\Test\\FindServices.grxml", SPLO_STATIC); } // Set all top-level rules in the new grammar to the active state. if (SUCCEEDED(hr)) { hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE); } // Set the recognizer state to active to begin recognition. if (SUCCEEDED(hr)) { hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS); } // Establish a separate Win32 event to signal the event loop exit. HANDLE hExitEvent = CreateEvent(NULL, FALSE, FALSE, NULL); // Collect the events listened for to pump the speech event loop. HANDLE rghEvents[] = { hSpeechNotifyEvent, hExitEvent }; // Speech recognition event loop. BOOL fContinue = TRUE; while (fContinue && SUCCEEDED(hr)) { // Wait for either a speech event or an exit event, with a 15 second timeout. DWORD dwMessage = WaitForMultipleObjects(sp_countof(rghEvents), rghEvents, FALSE, 15000); switch (dwMessage) { // With the WaitForMultipleObjects call above, WAIT_OBJECT_0 is a speech event from hSpeechNotifyEvent. case WAIT_OBJECT_0: { // Sequentially grab the available speech events from the speech event queue. 
CSpEvent spevent; while (S_OK == spevent.GetFrom(cpContext)) { switch (spevent.eEventId) { case SPEI_RECOGNITION: { // Retrieve the recognition result and output the text of that result. ISpRecoResult* pResult = spevent.RecoResult(); LPWSTR pszCoMemResultText = NULL; hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pszCoMemResultText;, NULL); if (SUCCEEDED(hr)) { wprintf(L"Recognition event received, text=\"%s\"\r\n", pszCoMemResultText); } // Also retrieve the retained audio we requested. CComPtr<ISpStreamFormat> cpRetainedAudio; if (SUCCEEDED(hr)) { hr = pResult->GetAudio(0, 0, &cpRetainedAudio;); } // To demonstrate, we'll speak the retained audio back using ISpVoice. CComPtr<ISpVoice> cpVoice; if (SUCCEEDED(hr)) { hr = cpVoice.CoCreateInstance(CLSID_SpVoice); } if (SUCCEEDED(hr)) { hr = cpVoice->SpeakStream(cpRetainedAudio, SPF_DEFAULT, 0); } if (NULL != pszCoMemResultText) { CoTaskMemFree(pszCoMemResultText); } break; } } } break; } case WAIT_OBJECT_0 + 1: case WAIT_TIMEOUT: { // Exit event or timeout; discontinue the speech loop. fContinue = FALSE; break; } } } // Pause to prevent application exit. wprintf(L"Press any key to exit!\r\n"); getchar(); } CoUninitialize(); return 0;
}
Sample XML Grammar
The following is the grammar referenced in the code example above. It can be used to recognize phrases such as "Find restaurants near Madrid". The code example sets the input for the recognizer to the system default audio input, so you can test the grammar by speaking into a microphone connected to your computer.
`
<?xml version="1.0" encoding="utf-8"?>
<!-- SRGS grammar that matches phrases of the form
     "Find <service> near <city>", e.g. "Find restaurants near Madrid". -->
<grammar version="1.0" xml:lang="en-US" mode="voice" root="findServices"
         xmlns="http://www.w3.org/2001/06/grammar" tag-format="semantics/1.0">

  <!-- Root rule: fixed words combined with the two choice rules below. -->
  <rule id="findServices">
    <item> Find </item>
    <ruleref uri="#services"/>
    <item> near </item>
    <ruleref uri="#city"/>
  </rule>

  <!-- The kinds of services that can be searched for. -->
  <rule id="services">
    <one-of>
      <item> restaurants </item>
      <item> gas stations </item>
      <item> coffee </item>
    </one-of>
  </rule>

  <!-- The cities that can be named as the search location. -->
  <rule id="city">
    <one-of>
      <item> Seattle </item>
      <item> Madrid </item>
      <item> London </item>
    </one-of>
  </rule>

</grammar>