devices/devicemocap/speech/speech.cxx

/***************************************************************************************
Autodesk(R) Open Reality(R) Samples
(C) 2013 Autodesk, Inc. and/or its licensors
All rights reserved.
AUTODESK SOFTWARE LICENSE AGREEMENT
Autodesk, Inc. licenses this Software to you only upon the condition that
you accept all of the terms contained in the Software License Agreement ("Agreement")
that is embedded in or that is delivered with this Software. By selecting
the "I ACCEPT" button at the end of the Agreement or by copying, installing,
uploading, accessing or using all or any portion of the Software you agree
to enter into the Agreement. A contract is then formed between Autodesk and
either you personally, if you acquire the Software for yourself, or the company
or other legal entity for which you are acquiring the software.
AUTODESK, INC., MAKES NO WARRANTY, EITHER EXPRESS OR IMPLIED, INCLUDING BUT
NOT LIMITED TO ANY IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
PURPOSE REGARDING THESE MATERIALS, AND MAKES SUCH MATERIALS AVAILABLE SOLELY ON AN
"AS-IS" BASIS.
IN NO EVENT SHALL AUTODESK, INC., BE LIABLE TO ANYONE FOR SPECIAL, COLLATERAL,
INCIDENTAL, OR CONSEQUENTIAL DAMAGES IN CONNECTION WITH OR ARISING OUT OF PURCHASE
OR USE OF THESE MATERIALS. THE SOLE AND EXCLUSIVE LIABILITY TO AUTODESK, INC.,
REGARDLESS OF THE FORM OF ACTION, SHALL NOT EXCEED THE PURCHASE PRICE OF THE
MATERIALS DESCRIBED HEREIN.
Autodesk, Inc., reserves the right to revise and improve its products as it sees fit.
Autodesk and Open Reality are registered trademarks or trademarks of Autodesk, Inc.,
in the U.S.A. and/or other countries. All other brand names, product names, or
trademarks belong to their respective holders.
GOVERNMENT USE
Use, duplication, or disclosure by the U.S. Government is subject to restrictions as
set forth in FAR 12.212 (Commercial Computer Software-Restricted Rights) and
DFAR 227.7202 (Rights in Technical Data and Computer Software), as applicable.
Manufacturer is Autodesk, Inc., 10 Duke Street, Montreal, Quebec, Canada, H3C 2L7.
***************************************************************************************/
#include "speech.h"
#include <fbsdk/fbsdk.h>
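
// Builds the wide-character path to the speech grammar file
// (<application path>\..\..\OpenRealitySDK\Scenes\speechwords.grxml)
// and initializes all Kinect/SAPI interface pointers to NULL.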
Speech::Speech() :
    mNuiSensor(NULL),
    mKinectAudioStream(NULL),
    mSpeechStream(NULL),
    mSpeechRecognizer(NULL),
    mSpeechContext(NULL),
    mSpeechGrammar(NULL),
    mValid(false)
{
    // Get the path for grammar file
    FBString lBinPath( FBSystem().ApplicationPath );
    FBString lFilePath = lBinPath + "\\..\\..\\OpenRealitySDK\\Scenes\\speechwords.grxml";

    size_t lNewSize = strlen(lFilePath) + 1;
    wchar_t* lWFilePath = new wchar_t[lNewSize];
    size_t lConvertedChars = 0;
    mbstowcs_s(&lConvertedChars, lWFilePath, lNewSize, lFilePath, _TRUNCATE);
    mGrammarFileName = lWFilePath;
}
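
// Frees the grammar path buffer and releases every COM interface this object owns.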
Speech::~Speech()
{
    delete[] mGrammarFileName;

    SafeRelease(mKinectAudioStream);
    SafeRelease(mSpeechStream);
    SafeRelease(mSpeechRecognizer);
    SafeRelease(mSpeechContext);
    SafeRelease(mSpeechGrammar);
}
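
// Wires the given Kinect sensor to the speech engine in four steps:
// audio stream -> recognizer -> grammar -> active recognition.
// On any failure a status message is shown and the object stays invalid,
// so the plugin keeps working without voice control.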
HRESULT Speech::CreateFirstConnected(INuiSensor* pSensor)
{
    mNuiSensor = pSensor;
    HRESULT hr;

    if (NULL == mNuiSensor)
    {
        SetStatusMessage("No ready Kinect found! \n\nBut you can still use this plugin without the voice control feature.");
        return E_FAIL;
    }

    hr = InitializeAudioStream();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not initialize audio stream. \n\nBut you can still use this plugin without the voice control feature.");
        return hr;
    }

    hr = CreateSpeechRecognizer();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not create speech recognizer. Please ensure that the Microsoft Speech SDK and other sample requirements are installed. \n\nBut you can still use this plugin without the voice control feature.");
        return hr;
    }

    hr = LoadSpeechGrammar();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not load speech grammar. Please ensure that the grammar configuration file was properly deployed. \n\nBut you can still use this plugin without the voice control feature.");
        return hr;
    }

    hr = StartSpeechRecognition();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not start recognizing speech. \n\nBut you can still use this plugin without the voice control feature.");
        return hr;
    }

    mValid = true;
    return hr;
}
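
// Routes the Kinect microphone-array audio into a SAPI stream:
// gets the sensor's audio DMO, configures its beamforming mode and
// 16-bit PCM output format, then wraps it in an ISpStream for the recognizer.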
HRESULT Speech::InitializeAudioStream()
{
    // Get the audio source
    if (NULL == mNuiSensor)
    {
        SetStatusMessage("No ready Kinect found!");
        return E_FAIL;
    }

    INuiAudioBeam*  pNuiAudioSource = NULL;
    IMediaObject*   pDMO = NULL;
    IPropertyStore* pPropertyStore = NULL;
    IStream*        pStream = NULL;

    HRESULT hr = mNuiSensor->NuiGetAudioSource(&pNuiAudioSource);
    if (SUCCEEDED(hr))
    {
        hr = pNuiAudioSource->QueryInterface(IID_IMediaObject, (void**)&pDMO);
        if (SUCCEEDED(hr))
        {
            hr = pNuiAudioSource->QueryInterface(IID_IPropertyStore, (void**)&pPropertyStore);

            // Set AEC-MicArray DMO system mode. This must be set for the DMO to work properly.
            // Possible values are:
            //   SINGLE_CHANNEL_AEC = 0
            //   OPTIBEAM_ARRAY_ONLY = 2
            //   OPTIBEAM_ARRAY_AND_AEC = 4
            //   SINGLE_CHANNEL_NSAGC = 5
            PROPVARIANT pvSysMode;
            PropVariantInit(&pvSysMode);
            pvSysMode.vt = VT_I4;
            pvSysMode.lVal = (LONG)(2); // Use OPTIBEAM_ARRAY_ONLY setting. Set OPTIBEAM_ARRAY_AND_AEC instead if you expect to have sound playing from speakers.
            pPropertyStore->SetValue(MFPKEY_WMAAECMA_SYSTEM_MODE, pvSysMode);
            PropVariantClear(&pvSysMode);

            // Set DMO output format
            WAVEFORMATEX wfxOut = {AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0};
            DMO_MEDIA_TYPE mt = {0};
            MoInitMediaType(&mt, sizeof(WAVEFORMATEX));

            mt.majortype = MEDIATYPE_Audio;
            mt.subtype = MEDIASUBTYPE_PCM;
            mt.lSampleSize = 0;
            mt.bFixedSizeSamples = TRUE;
            mt.bTemporalCompression = FALSE;
            mt.formattype = FORMAT_WaveFormatEx;
            memcpy(mt.pbFormat, &wfxOut, sizeof(WAVEFORMATEX));

            hr = pDMO->SetOutputType(0, &mt, 0);
            if (SUCCEEDED(hr))
            {
                mKinectAudioStream = new KinectAudioStream(pDMO);
                hr = mKinectAudioStream->QueryInterface(IID_IStream, (void**)&pStream);
                if (SUCCEEDED(hr))
                {
                    hr = CoCreateInstance(CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), (void**)&mSpeechStream);
                    if (SUCCEEDED(hr))
                    {
                        hr = mSpeechStream->SetBaseStream(pStream, SPDFID_WaveFormatEx, &wfxOut);
                    }
                }
            }
            MoFreeMediaType(&mt);
        }
    }

    SafeRelease(pStream);
    SafeRelease(pPropertyStore);
    SafeRelease(pDMO);
    SafeRelease(pNuiAudioSource);

    return hr;
}
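
// Starts audio capture, activates all top-level grammar rules, and puts the
// recognition engine in an always-listening state.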
HRESULT Speech::StartSpeechRecognition()
{
    HRESULT hr = mKinectAudioStream->StartCapture();
    if (SUCCEEDED(hr))
    {
        // Specify that all top level rules in grammar are now active
        mSpeechGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);

        // Specify that engine should always be reading audio
        mSpeechRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);

        // Specify that we're only interested in receiving recognition events
        mSpeechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));

        // Ensure that engine is recognizing speech and not in paused state
        hr = mSpeechContext->Resume(0);
    }
    return hr;
}
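
// Creates a grammar object on the recognition context and populates it from
// the speechwords.grxml file located in the constructor.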
HRESULT Speech::LoadSpeechGrammar()
{
    HRESULT hr = mSpeechContext->CreateGrammar(1, &mSpeechGrammar);
    if (SUCCEEDED(hr))
    {
        // Populate recognition grammar from file
        TCHAR lBuffer[1000];
        GetFullPathName(mGrammarFileName, 1000, lBuffer, NULL);
        hr = mSpeechGrammar->LoadCmdFromFile(lBuffer, SPLO_STATIC);
    }
    return hr;
}
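
// Polls the recognition context for pending SPEI_RECOGNITION events and maps
// the first confident semantic tag to a SpeechCommands value. Intended to be
// called periodically, e.g. from the owning device's evaluation loop
// (hypothetical caller, not part of this file):
//
//   SpeechCommands lCmd = mSpeech.Process();
//   if( lCmd == eStartRecording )
//   {
//       // begin recording the mocap take
//   }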
SpeechCommands Speech::Process()
{
    if( !mValid )
        return eNoCommand;

    const float ConfidenceThreshold = 0.3f;

    SPEVENT curEvent;
    ULONG fetched = 0;
    HRESULT hr = S_OK;

    mSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        switch (curEvent.eEventId)
        {
            case SPEI_RECOGNITION:
                if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType)
                {
                    // this is an ISpRecoResult
                    ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                    SPPHRASE* pPhrase = NULL;

                    hr = result->GetPhrase(&pPhrase);
                    if (SUCCEEDED(hr))
                    {
                        SpeechCommands action = eNoCommand;
                        if ((pPhrase->pProperties != NULL) && (pPhrase->pProperties->pFirstChild != NULL))
                        {
                            const SPPHRASEPROPERTY* pSemanticTag = pPhrase->pProperties->pFirstChild;
                            if (pSemanticTag->SREngineConfidence > ConfidenceThreshold)
                            {
                                action = MapSpeechTagToAction(pSemanticTag->pszValue);
                            }
                        }
                        // Free the phrase before returning so a recognized command does not leak it
                        ::CoTaskMemFree(pPhrase);
                        if (action != eNoCommand)
                        {
                            return action;
                        }
                    }
                }
                break;
        }
        mSpeechContext->GetEvents(1, &curEvent, &fetched);
    }
    return eNoCommand;
}
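
// Returns true once CreateFirstConnected() has completed successfully.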
bool Speech::IsValid()
{
    return mValid;
}
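
// Translates a semantic tag emitted by the grammar (e.g. L"PLAY") into the
// corresponding SpeechCommands enum value; unknown tags yield eNoCommand.
// The tags must match the ones declared in speechwords.grxml.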
SpeechCommands Speech::MapSpeechTagToAction( LPCWSTR pszSpeechTag )
{
    struct SpeechTagToAction
    {
        LPCWSTR        pszSpeechTag;
        SpeechCommands action;
    };
    const SpeechTagToAction Map[] =
    {
        {L"RECORD_START", eStartRecording},
        {L"RECORD_STOP",  eStopRecording},
        {L"PLAY",         ePlay},
        {L"RESET",        eReset}
    };

    SpeechCommands action = eNoCommand;
    for (int i = 0; i < _countof(Map); ++i)
    {
        if (0 == wcscmp(Map[i].pszSpeechTag, pszSpeechTag))
        {
            action = Map[i].action;
            break;
        }
    }
    return action;
}
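
// Creates an in-process SAPI recognizer fed by the Kinect audio stream and
// selects the US-English Kinect acoustic model ("Language=409;Kinect=True"),
// then creates the recognition context used to receive events.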
HRESULT Speech::CreateSpeechRecognizer()
{
    ISpObjectToken *pEngineToken = NULL;

    HRESULT hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), (void**)&mSpeechRecognizer);
    if (SUCCEEDED(hr))
    {
        mSpeechRecognizer->SetInput(mSpeechStream, FALSE);
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"Language=409;Kinect=True", NULL, &pEngineToken);
        if (SUCCEEDED(hr))
        {
            mSpeechRecognizer->SetRecognizer(pEngineToken);
            hr = mSpeechRecognizer->CreateRecoContext(&mSpeechContext);

            // For long recognition sessions (a few hours or more), it may be beneficial to turn off adaptation of the acoustic model.
            // This will prevent recognition accuracy from degrading over time.
            //if (SUCCEEDED(hr))
            //{
            //    hr = mSpeechRecognizer->SetPropertyNum(L"AdaptationOn", 0);
            //}
        }
    }
    SafeRelease(pEngineToken);
    return hr;
}
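
// Reports a problem to the user with a modal MotionBuilder message box.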
void Speech::SetStatusMessage(const char* szMessage)
{
    FBMessageBox( "Mocap Device Error:",
                  szMessage,
                  "OK");
}
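
// Stops audio capture and closes the speech stream; the object is marked
// invalid so Process() becomes a no-op until reinitialized.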
void Speech::Stop()
{
    if( mKinectAudioStream )
        mKinectAudioStream->StopCapture();
    if( mSpeechStream )
        mSpeechStream->Close();
    mValid = false;
}