Freigeben über


Note

Please see Azure Cognitive Services for Speech documentation for the latest supported speech solutions.

Microsoft Speech Platform

Use WAV File Input for Speech Recognition

Applications can use a WAV file as the audio input source to perform speech recognition in the Microsoft Speech Platform. This topic presents the following information about performing speech recognition from WAV files:

  • When to use WAV files as input
  • How to configure an application for speech recognition from a WAV file
  • What APIs to use
  • Sample source code

When to use WAV files as input

Speech recognition (SR) applications typically use the following audio input configurations:

  • A telephony card communicating with one or more SR engines
  • Sending audio from a persisted WAV file to an SR engine

The telephony scenario can use either the Speech Platform's standard multimedia audio input object or a custom audio object combined with an SR engine.

The WAV file input scenario uses controlled, reproducible audio input and requires a dedicated SR engine. The file input scenario should use a generic Speech Platform audio stream connected to the input WAV file.

Typical scenarios that would use a WAV file as the audio input for speech recognition include the following:

  • Offline transcription applications (for example, convert voice mail to email)
  • SR engine testing (for example, measure and improve engine accuracy with reproducible audio input data)
  • SR application testing (for example, verify and improve application behavior when responding to reproducible voice commands)

Back to top

Configure an application for speech recognition from a WAV file

Follow these basic steps to perform speech recognition on a WAV file:

  1. Create and configure a Speech Platform audio stream object for WAV file input.
  2. Create an SR engine using the code samples in this document.
  3. Set the audio stream object from step 1 as the SR engine's input.
  4. Activate grammars and begin speech recognition.
  5. Respond to recognition events until end of audio stream is reached.

APIs to use for speech recognition from WAV file input

Applications typically use the following APIs to recognize audio from a WAV file.

API What it does
ISpStream The basic audio stream in the Speech Platform
ISpStream::BindToFile Sets up an audio stream for WAV file input
SpBindToFile Helper function to set up a stream with a WAV file
ISpRecognizer Manage the speech recognition (SR) engine
ISpRecognizer::SetInput Set the stream object as the SR engine's input
SPEI_START_SR_STREAM, SPEI_END_SR_STREAM Events that are raised when the SR engine has reached the beginning or the end of the WAV file, respectively

Back to top

Example

The following code example in COM/C++ creates a console application that performs speech recognition using a WAV file as input. For recognition to be successful, the speech contents of the WAV file must match the grammar loaded by the speech recognition engine. For example, the "FlightDestination.grxml" grammar referenced in the example recognizes phrases such as "I want to fly to Boston". See the end of the example for the contents of FlightDestination.grxml.

`

int _tmain(int argc, _TCHAR* argv[])
{
CoInitialize(NULL);
{
HRESULT hr = S_OK;

    // Find the best matching installed en-us recognizer.
    CComPtr<ISpObjectToken> cpRecognizerToken;

    if (SUCCEEDED(hr))
    {
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=409", NULL, &cpRecognizerToken;);
    }

    // Create a recognizer and immediately set its state to inactive.
    CComPtr<ISpRecognizer> cpRecognizer;

    if (SUCCEEDED(hr))
    {
        hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
    }

    if (SUCCEEDED(hr))
    {
        hr = cpRecognizer->SetRecognizer(cpRecognizerToken);
    }

    if (SUCCEEDED(hr))
    {
        hr = cpRecognizer->SetRecoState(SPRST_INACTIVE);
    }

    // Create a new recognition context from the recognizer.
    CComPtr<ISpRecoContext> cpContext;

    if (SUCCEEDED(hr))
    {
        hr = cpRecognizer->CreateRecoContext(&cpContext;);
    }

    // Subscribe to the speech recognition event and end stream event.
    if (SUCCEEDED(hr))
    {
        ULONGLONG ullEventInterest = SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM);
        hr = cpContext->SetInterest(ullEventInterest, ullEventInterest);
    }

    // Establish a Win32 event to signal when speech events are available.
    HANDLE hSpeechNotifyEvent = INVALID_HANDLE_VALUE;

    if (SUCCEEDED(hr))
    {
        hr = cpContext->SetNotifyWin32Event();
    }

    if (SUCCEEDED(hr))
    {
        hSpeechNotifyEvent = cpContext->GetNotifyEventHandle();

        if (INVALID_HANDLE_VALUE == hSpeechNotifyEvent)
        {
            // Notification handle unsupported.
            hr = E_NOINTERFACE;
        }
    }

    // Set up an audio input stream using a .wav file and set the recognizer's input.
    CComPtr<ISpStream> cpInputStream;

    if (SUCCEEDED(hr))
    {
        hr = SPBindToFile(L"C:\\Test\\FlightDestination.wav", SPFM_OPEN_READONLY, &cpInputStream;);
    }

    if (SUCCEEDED(hr))
    {
        hr = cpRecognizer->SetInput(cpInputStream, TRUE);
    }

    // Create a new grammar and load an SRGS grammar from file.
    CComPtr<ISpRecoGrammar> cpGrammar;

    if (SUCCEEDED(hr))
    {
        hr = cpContext->CreateGrammar(0, &cpGrammar;);
    }

    if (SUCCEEDED(hr))
    {
        hr = cpGrammar->LoadCmdFromFile(L"C:\\Test\\FlightDestination.grxml", SPLO_STATIC);
    }

    // Set all top-level rules in the new grammar to the active state.
    if (SUCCEEDED(hr))
    {
        hr = cpGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
    }

    // Set the recognizer state to active to begin recognition.
    if (SUCCEEDED(hr))
    {
        hr = cpRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
    }    

    // Establish a separate win32 event to signal event loop exit.
    HANDLE hExitEvent = CreateEvent(NULL, FALSE, FALSE, NULL);

    // Collect the events listened for to pump the speech event loop.
    HANDLE rghEvents[] = { hSpeechNotifyEvent, hExitEvent };

    // Speech recognition event loop.
    BOOL fContinue = TRUE;

    while (fContinue && SUCCEEDED(hr))
    {
        // Wait for either a speech event or an exit event.
        DWORD dwMessage = WaitForMultipleObjects(sp_countof(rghEvents), rghEvents, FALSE, INFINITE);

        switch (dwMessage)
        {
            // With the WaitForMultipleObjects call above, WAIT_OBJECT_0 is a speech event from hSpeechNotifyEvent.
            case WAIT_OBJECT_0: 
            {
                // Sequentially grab the available speech events from the speech event queue.
                CSpEvent spevent;

                while (S_OK == spevent.GetFrom(cpContext))
                {
                    switch (spevent.eEventId)
                    {
                        case SPEI_RECOGNITION:
                        {
                            // Retrieve the recognition result and output the text of that result.
                            ISpRecoResult* pResult = spevent.RecoResult();

                            LPWSTR pszCoMemResultText = NULL;
                            hr = pResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pszCoMemResultText;, NULL);

                            if (SUCCEEDED(hr))
                            {
                                wprintf(L"Recognition event received, text=\"%s\"\r\n", pszCoMemResultText);
                            }

                            if (NULL != pszCoMemResultText)
                            {
                                CoTaskMemFree(pszCoMemResultText);
                            }

                            break;
                        }
                        case SPEI_END_SR_STREAM:
                        {
                            // The stream has ended; signal the exit event if it hasn't been signaled already.
                            wprintf(L"End stream event received\r\n");
                            SetEvent(hExitEvent);
                            break;
                        }
                    }
                }

                break;
            }
            case WAIT_OBJECT_0 + 1:
            {
                // Exit event; discontinue the speech loop.
                fContinue = FALSE;
                break;
            }
        }
    }

    // Pause to prevent application exit.
    wprintf(L"Press any key to exit!\r\n");
    getchar();
}
CoUninitialize();

return 0;

}

`

The following are the contents of the grammar FlightDestination.grxml.

`

<?xml version="1.0" encoding="utf-8"?>
<grammar version="1.0" xml:lang="en-US" mode="voice" root="destination"
xmlns="http://www.w3.org/2001/06/grammar" tag-format="semantics/1.0">

<rule id="destination"> <item> I want to fly to </item> <ruleref uri="#city"/> </rule>

<rule id="city"> <one-of> <item> Boston </item> <item> Madrid </item> <item> London </item> </one-of> </rule>

</grammar>

`

Back to top