#include "speech.h"
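
// Speech wraps the Kinect audio pipeline and the Microsoft Speech API (SAPI):
// it exposes the sensor's microphone array as a SAPI stream, loads a command
// grammar, and maps recognized semantic tags to SpeechCommands values.

// The constructor only resolves the grammar file location: the path is built
// relative to the MotionBuilder binary folder and converted to a
// wide-character string for the SAPI grammar loader.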
Speech::Speech() :
    mNuiSensor(NULL),
    mKinectAudioStream(NULL),
    mSpeechStream(NULL),
    mSpeechRecognizer(NULL),
    mSpeechContext(NULL),
    mSpeechGrammar(NULL),
    mGrammarFileName(NULL),
    mValid(false)
{
    FBString lBinPath( FBSystem().ApplicationPath );
    FBString lFilePath = lBinPath + "//..//..//OpenRealitySDK//Scenes//speechwords.grxml";

    // Convert the multi-byte path to a wide-character string.
    size_t lNewSize = strlen(lFilePath) + 1;
    wchar_t* lWFilePath = new wchar_t[lNewSize];
    size_t lConvertedChars = 0;
    mbstowcs_s(&lConvertedChars, lWFilePath, lNewSize, lFilePath, _TRUNCATE);
    mGrammarFileName = lWFilePath;
}
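
// Release the grammar file name buffer and every COM interface owned by this object.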
Speech::~Speech()
{
    delete[] mGrammarFileName;

    SafeRelease(mKinectAudioStream);
    SafeRelease(mSpeechStream);
    SafeRelease(mSpeechRecognizer);
    SafeRelease(mSpeechContext);
    SafeRelease(mSpeechGrammar);
}
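
// Wires the given sensor into the full speech pipeline: audio stream, recognizer,
// grammar, and recognition loop. Any failure leaves the plugin usable, just without
// the voice control feature.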
HRESULT Speech::CreateFirstConnected(INuiSensor* pSensor)
{
    mNuiSensor = pSensor;
    HRESULT hr;

    if (NULL == mNuiSensor)
    {
        SetStatusMessage("No ready Kinect found! \n\nBut you can still use this plugin without voice control feature.");
        return E_FAIL;
    }

    hr = InitializeAudioStream();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not initialize audio stream. \n\nBut you can still use this plugin without voice control feature.");
        return hr;
    }

    hr = CreateSpeechRecognizer();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not create speech recognizer. Please ensure that Microsoft Speech SDK and other sample requirements are installed. \n\nBut you can still use this plugin without voice control feature.");
        return hr;
    }

    hr = LoadSpeechGrammar();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not load speech grammar. Please ensure that grammar configuration file was properly deployed. \n\nBut you can still use this plugin without voice control feature.");
        return hr;
    }

    hr = StartSpeechRecognition();
    if (FAILED(hr))
    {
        SetStatusMessage("Could not start recognizing speech. \n\nBut you can still use this plugin without voice control feature.");
        return hr;
    }

    mValid = true;
    return hr;
}
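
// Opens the Kinect audio source, configures its DMO for microphone-array capture,
// and exposes the captured PCM audio to SAPI through an ISpStream.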
HRESULT Speech::InitializeAudioStream()
{
    if (NULL == mNuiSensor)
    {
        SetStatusMessage("No ready Kinect found!");
        return E_FAIL;
    }

    INuiAudioBeam* pNuiAudioSource = NULL;
    IMediaObject* pDMO = NULL;
    IPropertyStore* pPropertyStore = NULL;
    IStream* pStream = NULL;

    // Get the audio source from the sensor and query its DMO and property store interfaces.
    HRESULT hr = mNuiSensor->NuiGetAudioSource(&pNuiAudioSource);
    if (SUCCEEDED(hr))
    {
        hr = pNuiAudioSource->QueryInterface(IID_IMediaObject, (void**)&pDMO);
        if (SUCCEEDED(hr))
        {
            hr = pNuiAudioSource->QueryInterface(IID_IPropertyStore, (void**)&pPropertyStore);

            // Set the DMO system mode: 2 == OPTIBEAM_ARRAY_ONLY (microphone-array
            // processing without acoustic echo cancellation).
            PROPVARIANT pvSysMode;
            PropVariantInit(&pvSysMode);
            pvSysMode.vt = VT_I4;
            pvSysMode.lVal = (LONG)(2);
            pPropertyStore->SetValue(MFPKEY_WMAAECMA_SYSTEM_MODE, pvSysMode);
            PropVariantClear(&pvSysMode);

            // Describe the PCM output format the DMO should deliver.
            WAVEFORMATEX wfxOut = {AudioFormat, AudioChannels, AudioSamplesPerSecond, AudioAverageBytesPerSecond, AudioBlockAlign, AudioBitsPerSample, 0};
            DMO_MEDIA_TYPE mt = {0};
            MoInitMediaType(&mt, sizeof(WAVEFORMATEX));

            mt.majortype = MEDIATYPE_Audio;
            mt.subtype = MEDIASUBTYPE_PCM;
            mt.lSampleSize = 0;
            mt.bFixedSizeSamples = TRUE;
            mt.bTemporalCompression = FALSE;
            mt.formattype = FORMAT_WaveFormatEx;
            memcpy(mt.pbFormat, &wfxOut, sizeof(WAVEFORMATEX));

            hr = pDMO->SetOutputType(0, &mt, 0);
            if (SUCCEEDED(hr))
            {
                // Wrap the DMO in a KinectAudioStream and hand it to SAPI as the base stream.
                mKinectAudioStream = new KinectAudioStream(pDMO);
                hr = mKinectAudioStream->QueryInterface(IID_IStream, (void**)&pStream);
                if (SUCCEEDED(hr))
                {
                    hr = CoCreateInstance(CLSID_SpStream, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), (void**)&mSpeechStream);
                    if (SUCCEEDED(hr))
                    {
                        hr = mSpeechStream->SetBaseStream(pStream, SPDFID_WaveFormatEx, &wfxOut);
                    }
                }
            }
            MoFreeMediaType(&mt);
        }
    }

    SafeRelease(pStream);
    SafeRelease(pPropertyStore);
    SafeRelease(pDMO);
    SafeRelease(pNuiAudioSource);

    return hr;
}
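
// Starts audio capture, activates all top-level grammar rules, and subscribes
// the recognition context to SPEI_RECOGNITION events only.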
HRESULT Speech::StartSpeechRecognition()
{
    HRESULT hr = mKinectAudioStream->StartCapture();
    if (SUCCEEDED(hr))
    {
        mSpeechGrammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
        mSpeechRecognizer->SetRecoState(SPRST_ACTIVE_ALWAYS);
        mSpeechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
        hr = mSpeechContext->Resume(0);
    }
    return hr;
}
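
// Creates a grammar object on the recognition context and loads the command words
// from the .grxml file resolved in the constructor.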
HRESULT Speech::LoadSpeechGrammar()
{
    HRESULT hr = mSpeechContext->CreateGrammar(1, &mSpeechGrammar);
    if (SUCCEEDED(hr))
    {
        TCHAR lBuffer[1000];
        GetFullPathName(mGrammarFileName, 1000, lBuffer, NULL);
        hr = mSpeechGrammar->LoadCmdFromFile(lBuffer, SPLO_STATIC);
    }
    return hr;
}
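
// Polls the recognition context for pending events and returns the command mapped
// from the first recognition whose semantic tag exceeds the confidence threshold;
// otherwise returns eNoCommand.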
SpeechCommands Speech::Process()
{
    if( !mValid )
        return eNoCommand;

    const float ConfidenceThreshold = 0.3f;

    SPEVENT curEvent;
    ULONG fetched = 0;
    HRESULT hr = S_OK;

    mSpeechContext->GetEvents(1, &curEvent, &fetched);
    while (fetched > 0)
    {
        switch (curEvent.eEventId)
        {
        case SPEI_RECOGNITION:
            if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType)
            {
                ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
                SPPHRASE* pPhrase = NULL;
                hr = result->GetPhrase(&pPhrase);
                if (SUCCEEDED(hr))
                {
                    SpeechCommands action = eNoCommand;
                    if ((pPhrase->pProperties != NULL) && (pPhrase->pProperties->pFirstChild != NULL))
                    {
                        const SPPHRASEPROPERTY* pSemanticTag = pPhrase->pProperties->pFirstChild;
                        if (pSemanticTag->SREngineConfidence > ConfidenceThreshold)
                        {
                            action = MapSpeechTagToAction(pSemanticTag->pszValue);
                        }
                    }
                    // Free the phrase before returning so the CoTaskMem allocation is not leaked.
                    ::CoTaskMemFree(pPhrase);
                    if (action != eNoCommand)
                    {
                        return action;
                    }
                }
            }
            break;
        }
        mSpeechContext->GetEvents(1, &curEvent, &fetched);
    }
    return eNoCommand;
}
bool Speech::IsValid()
{
    return mValid;
}
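
// Translates a semantic tag defined in the grammar (e.g. RECORD_START) into the
// corresponding SpeechCommands value.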
SpeechCommands Speech::MapSpeechTagToAction( LPCWSTR pszSpeechTag )
{
    struct SpeechTagToAction
    {
        LPCWSTR pszSpeechTag;
        SpeechCommands action;
    };

    const SpeechTagToAction Map[] =
    {
        {L"RECORD_START", eStartRecording},
        {L"RECORD_STOP", eStopRecording},
        {L"PLAY", ePlay},
        {L"RESET", eReset}
    };

    SpeechCommands action = eNoCommand;
    for (int i = 0; i < _countof(Map); ++i)
    {
        if (0 == wcscmp(Map[i].pszSpeechTag, pszSpeechTag))
        {
            action = Map[i].action;
            break;
        }
    }
    return action;
}
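
// Creates an in-process SAPI recognizer, binds it to the Kinect audio stream, and
// selects the English (Language=409) Kinect acoustic model for the recognition context.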
HRESULT Speech::CreateSpeechRecognizer()
{
    ISpObjectToken* pEngineToken = NULL;

    HRESULT hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), (void**)&mSpeechRecognizer);
    if (SUCCEEDED(hr))
    {
        mSpeechRecognizer->SetInput(mSpeechStream, FALSE);
        hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"Language=409;Kinect=True", NULL, &pEngineToken);
        if (SUCCEEDED(hr))
        {
            mSpeechRecognizer->SetRecognizer(pEngineToken);
            hr = mSpeechRecognizer->CreateRecoContext(&mSpeechContext);
        }
    }
    SafeRelease(pEngineToken);
    return hr;
}
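
// Reports a status or error message to the user.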
void Speech::SetStatusMessage(const char* szMessage)
{
    // Display the text in a MotionBuilder message box with a single OK button.
    FBMessageBox("Speech", szMessage, "OK");
}
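
// Stops audio capture and closes the speech stream; subsequent Process() calls
// return eNoCommand.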
void Speech::Stop()
{
    if( mKinectAudioStream )
        mKinectAudioStream->StopCapture();

    if( mSpeechStream )
        mSpeechStream->Close();

    mValid = false;
}