|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
#include <stdlib.h>
|
|
|
#include <algorithm>
|
|
|
#include <math.h>
|
|
|
#include "aed.h"
|
|
|
#include "aed_st.h"
|
|
|
#include "coeff.h"
|
|
|
#include "pitch_est.h"
|
|
|
#include "stft.h"
|
|
|
#include <assert.h>
|
|
|
|
|
|
// Rounds o up to the next multiple of 8; used below when sizing the
// sub-buffers carved out of the dynamic memory block.
#define AUP_AED_ALIGN8(o) (((o) + 7) & (~7))
|
|
|
// Larger of x and y. Function-like macro: the selected argument is evaluated
// twice, so avoid arguments with side effects.
#define AUP_AED_MAX(x, y) (((x) > (y)) ? (x) : (y))
|
|
|
// Smaller of x and y. Same double-evaluation caveat as AUP_AED_MAX.
#define AUP_AED_MIN(x, y) (((x) > (y)) ? (y) : (x))
|
|
|
// Tiny positive constant added before logf() and to divisors in the feature
// normalisation so neither sees an exact zero.
#define AUP_AED_EPS (1e-20f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Creates the ONNX Runtime environment and session for the model stored at
// `onnx_path`, caches the model's input/output names, and pre-builds the
// input tensors (backed by this object's own buffers) and output tensors.
//
// `inited` is set to 1 only when every step succeeded. On any failure the
// error is printed, everything created so far is released, and the object is
// left un-inited so that Process()/Reset() report -1.
AUP_MODULE_AIVAD::AUP_MODULE_AIVAD(char* onnx_path) {
  ort_api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
  if (ort_api == NULL) {
    // GetApi() yields NULL when the runtime cannot serve the requested version.
    printf("Failed to get ONNX Runtime API\n");
    return;
  }

  // Returns true on success; otherwise logs "Failed to <what>: <msg>", frees
  // the status object and returns false.
  auto succeeded = [this](OrtStatus* st, const char* what) -> bool {
    if (st == NULL) {
      return true;
    }
    printf("Failed to %s: %s\n", what, ort_api->GetErrorMessage(st));
    ort_api->ReleaseStatus(st);
    return false;
  };

  if (!succeeded(ort_api->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "TEN-VAD",
                                    &ort_env),
                 "create env")) {
    ort_api->ReleaseEnv(ort_env);
    ort_env = NULL;
    return;
  }

  OrtSessionOptions* session_options = NULL;
  if (!succeeded(ort_api->CreateSessionOptions(&session_options),
                 "create session options")) {
    ort_api->ReleaseEnv(ort_env);
    ort_env = NULL;
    return;
  }

  // Limiting the thread count is best effort: a failure here is not fatal,
  // but the status object still has to be freed.
  OrtStatus* status = ort_api->SetIntraOpNumThreads(session_options, 1);
  if (status != NULL) {
    ort_api->ReleaseStatus(status);
  }

  status =
      ort_api->CreateSession(ort_env, onnx_path, session_options, &ort_session);
  ort_api->ReleaseSessionOptions(session_options);
  if (!succeeded(status, "create ort_session")) {
    ort_api->ReleaseEnv(ort_env);
    ort_env = NULL;
    return;
  }

  // From here on every failure path goes through abort_init(), which releases
  // exactly what has been created so far.
  OrtMemoryInfo* memory_info = NULL;
  size_t inputs_made = 0;
  size_t outputs_made = 0;
  auto abort_init = [&]() {
    for (size_t k = 0; k < outputs_made; k++) {
      ort_api->ReleaseValue(ort_output_tensors[k]);
      ort_output_tensors[k] = NULL;
    }
    for (size_t k = 0; k < inputs_made; k++) {
      ort_api->ReleaseValue(ort_input_tensors[k]);
      ort_input_tensors[k] = NULL;
    }
    if (memory_info != NULL) {
      ort_api->ReleaseMemoryInfo(memory_info);
      memory_info = NULL;
    }
    ort_api->ReleaseSession(ort_session);
    ort_api->ReleaseEnv(ort_env);
    ort_session = NULL;
    ort_env = NULL;
  };

  if (!succeeded(ort_api->GetAllocatorWithDefaultOptions(&ort_allocator),
                 "get default allocator")) {
    abort_init();
    return;
  }

  const size_t io_num = (size_t)AUP_AED_MODEL_IO_NUM;

  // ---- input names ----
  size_t num_inputs = 0;
  if (!succeeded(ort_api->SessionGetInputCount(ort_session, &num_inputs),
                 "query input count")) {
    abort_init();
    return;
  }
  assert(num_inputs == io_num);
  if (num_inputs != io_num) {
    // Runtime guard for builds where assert() is compiled out: the name and
    // tensor arrays are indexed by these counts.
    printf("Unexpected model input count: %d\n", (int)num_inputs);
    abort_init();
    return;
  }
  for (size_t i = 0; i < num_inputs; i++) {
    char* input_name = NULL;
    if (!succeeded(ort_api->SessionGetInputName(ort_session, i, ort_allocator,
                                                &input_name),
                   "query input name")) {
      abort_init();
      return;
    }
    // strncpy() does not terminate when the source fills the buffer, so copy
    // one byte less and terminate explicitly.
    strncpy(input_names_buf[i], input_name, sizeof(input_names_buf[i]) - 1);
    input_names_buf[i][sizeof(input_names_buf[i]) - 1] = '\0';
    input_names[i] = input_names_buf[i];
    ort_api->AllocatorFree(ort_allocator, input_name);
  }

  // ---- output names ----
  size_t num_outputs = 0;
  if (!succeeded(ort_api->SessionGetOutputCount(ort_session, &num_outputs),
                 "query output count")) {
    abort_init();
    return;
  }
  assert(num_outputs == io_num);
  if (num_outputs != io_num) {
    printf("Unexpected model output count: %d\n", (int)num_outputs);
    abort_init();
    return;
  }
  for (size_t i = 0; i < num_outputs; i++) {
    char* output_name = NULL;
    if (!succeeded(ort_api->SessionGetOutputName(ort_session, i, ort_allocator,
                                                 &output_name),
                   "query output name")) {
      abort_init();
      return;
    }
    strncpy(output_names_buf[i], output_name, sizeof(output_names_buf[i]) - 1);
    output_names_buf[i][sizeof(output_names_buf[i]) - 1] = '\0';
    output_names[i] = output_names_buf[i];
    ort_api->AllocatorFree(ort_allocator, output_name);
  }

  // ---- input tensors: wrap this object's buffers, no copy ----
  if (!succeeded(ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator,
                                              OrtMemTypeDefault, &memory_info),
                 "create memory info")) {
    memory_info = NULL;
    abort_init();
    return;
  }

  int64_t input_shapes0[] = {1, AUP_AED_CONTEXT_WINDOW_LEN, AUP_AED_FEA_LEN};
  int64_t input_shapes1234[] = {1, AUP_AED_MODEL_HIDDEN_DIM};
  for (size_t i = 0; i < num_inputs; i++) {
    // Input 0 is the feature window; the remaining inputs are the state
    // buffers that Process() refreshes from the matching outputs.
    const bool first = (i == 0);
    void* data =
        first ? (void*)input_data_buf_0 : (void*)input_data_buf_1234[i - 1];
    const size_t data_bytes =
        first ? sizeof(input_data_buf_0) : sizeof(input_data_buf_1234[i - 1]);
    const int64_t* shape = first ? input_shapes0 : input_shapes1234;
    const size_t rank =
        first ? sizeof(input_shapes0) / sizeof(input_shapes0[0])
              : sizeof(input_shapes1234) / sizeof(input_shapes1234[0]);

    status = ort_api->CreateTensorWithDataAsOrtValue(
        memory_info, data, data_bytes, shape, rank,
        ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &ort_input_tensors[i]);
    if (status != NULL) {
      printf("Failed to create input tensor %d: %s\n", (int)i,
             ort_api->GetErrorMessage(status));
      ort_api->ReleaseStatus(status);
      abort_init();
      return;
    }
    inputs_made = i + 1;
  }

  // The memory-info descriptor is only needed while the tensors are created.
  ort_api->ReleaseMemoryInfo(memory_info);
  memory_info = NULL;

  // ---- output tensors: storage owned by the ORT allocator ----
  int64_t output_shapes0[] = {1, 1, 1};
  int64_t output_shapes1234[] = {1, AUP_AED_MODEL_HIDDEN_DIM};
  for (size_t i = 0; i < num_outputs; i++) {
    const bool first = (i == 0);
    const int64_t* shape = first ? output_shapes0 : output_shapes1234;
    const size_t rank =
        first ? sizeof(output_shapes0) / sizeof(output_shapes0[0])
              : sizeof(output_shapes1234) / sizeof(output_shapes1234[0]);

    status = ort_api->CreateTensorAsOrtValue(
        ort_allocator, shape, rank, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
        &ort_output_tensors[i]);
    if (status != NULL) {
      printf("Failed to create output tensor %d: %s\n", (int)i,
             ort_api->GetErrorMessage(status));
      ort_api->ReleaseStatus(status);
      abort_init();
      return;
    }
    outputs_made = i + 1;
  }

  inited = 1;
}
|
|
|
|
|
|
// Releases the ORT objects owned by this instance: the tensors first, then
// the session, then the environment.
AUP_MODULE_AIVAD::~AUP_MODULE_AIVAD() {
  for (int i = 0; i < AUP_AED_MODEL_IO_NUM; i++) {
    // The constructor sets `inited` only after every input tensor has been
    // created, so the input handles are known to be valid exactly when it is
    // set. Releasing them frees only the OrtValue wrapper; the data lives in
    // this object's own buffers.
    if (inited && ort_input_tensors[i]) {
      ort_api->ReleaseValue(ort_input_tensors[i]);
      ort_input_tensors[i] = NULL;
    }
    if (ort_output_tensors[i]) {
      ort_api->ReleaseValue(ort_output_tensors[i]);
      ort_output_tensors[i] = NULL;
    }
  }

  if (ort_session) {
    ort_api->ReleaseSession(ort_session);
    ort_session = NULL;
  }

  if (ort_env) {
    ort_api->ReleaseEnv(ort_env);
    ort_env = NULL;
  }
}
|
|
|
|
|
|
int AUP_MODULE_AIVAD::Process(float* input, float* output) {
|
|
|
if (!inited) {
|
|
|
printf("not inited!\n");
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
memcpy(input_data_buf_0, input, sizeof(input_data_buf_0));
|
|
|
if (clear_hidden) {
|
|
|
memset(input_data_buf_1234, 0, sizeof(input_data_buf_1234));
|
|
|
clear_hidden = 0;
|
|
|
}
|
|
|
OrtStatus* status = ort_api->Run(
|
|
|
ort_session, NULL, input_names, ort_input_tensors, AUP_AED_MODEL_IO_NUM,
|
|
|
output_names, AUP_AED_MODEL_IO_NUM, ort_output_tensors);
|
|
|
float* output_data;
|
|
|
ort_api->GetTensorMutableData(ort_output_tensors[0], (void**)&output_data);
|
|
|
*output = output_data[0];
|
|
|
for (int i = 1; i < AUP_AED_MODEL_IO_NUM; i++) {
|
|
|
ort_api->GetTensorMutableData(ort_output_tensors[i], (void**)&output_data);
|
|
|
memcpy(input_data_buf_1234[i - 1], output_data,
|
|
|
sizeof(input_data_buf_1234[i - 1]));
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_MODULE_AIVAD::Reset() {
|
|
|
if (!inited) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
clear_hidden = 1;
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
// Validates a static configuration.
// Returns 0 when acceptable, -1 when pCfg is NULL or a constraint is broken:
//   - the build must have AUP_AED_FEA_LEN >= AUP_AED_MEL_FILTER_BANK_NUM;
//   - hopSz must be at least 32;
//   - with frequency-domain input enabled, fftSz must be >= 128 and >= hopSz,
//     and anaWindowSz must lie within [hopSz, fftSz].
static int AUP_Aed_checkStatCfg(Aed_StaticCfg* pCfg) {
  if (pCfg == NULL) {
    return -1;
  }

#if AUP_AED_FEA_LEN < AUP_AED_MEL_FILTER_BANK_NUM
  return -1;
#endif

  if (pCfg->hopSz < 32) {
    return -1;
  }

  if (pCfg->frqInputAvailableFlag == 1) {
    const bool fftInvalid = (pCfg->fftSz < 128) || (pCfg->fftSz < pCfg->hopSz);
    if (fftInvalid) {
      return -1;
    }
    const bool windowInvalid = (pCfg->anaWindowSz > pCfg->fftSz) ||
                               (pCfg->anaWindowSz < pCfg->hopSz);
    if (windowInvalid) {
      return -1;
    }
  }

  return 0;
}
|
|
|
|
|
|
// Derives the internal ("int*") and external ("ext*") frame geometry and the
// fixed algorithm sizes from stHdl->stCfg.
// Returns 0 on success, -1 when stHdl is NULL.
static int AUP_Aed_publishStaticCfg(Aed_St* stHdl) {
  if (stHdl == NULL) {
    return -1;
  }

  // Geometry of the caller-supplied spectrum; all zero when the caller
  // provides no frequency-domain input.
  if (stHdl->stCfg.frqInputAvailableFlag == 1) {
    stHdl->extFftSz = stHdl->stCfg.fftSz;
    stHdl->extNBins = (stHdl->extFftSz >> 1) + 1;
    stHdl->extWinSz = stHdl->stCfg.anaWindowSz;
  } else {
    stHdl->extFftSz = 0;
    stHdl->extNBins = 0;
    stHdl->extWinSz = 0;
  }
  stHdl->extHopSz = stHdl->stCfg.hopSz;

  // Geometry the module itself works with.
  stHdl->intFftSz = AUP_AED_ASSUMED_FFTSZ;
  stHdl->intHopSz = AUP_AED_ASSUMED_HOPSZ;
  stHdl->intWinSz = AUP_AED_ASSUMED_WINDOWSZ;
  stHdl->intNBins = (stHdl->intFftSz >> 1) + 1;
  stHdl->intAnalyWindowPtr = AUP_AED_STFTWindow_Hann768;

  // Analysis mode as consumed by AUP_Aed_proc:
  //   2 - run the internal analyzer on the time signal,
  //   0 - use the caller's bin power as is,
  //   1 - resample the caller's bin power to the internal bin count.
  const bool mustAnalyseInternally =
      (stHdl->stCfg.frqInputAvailableFlag == 0) ||
      (stHdl->extHopSz != stHdl->intHopSz);
  if (mustAnalyseInternally) {
    stHdl->intAnalyFlag = 2;
  } else {
    stHdl->intAnalyFlag = (stHdl->extFftSz == stHdl->intFftSz) ? 0 : 1;
  }

  stHdl->inputTimeFIFOLen = stHdl->extHopSz + stHdl->intHopSz;

  // The mode chosen above is overridden unconditionally: internal analysis
  // is always used.
  stHdl->intAnalyFlag = 2;

  stHdl->feaSz = (size_t)AUP_AED_FEA_LEN;
  stHdl->melFbSz = (size_t)AUP_AED_MEL_FILTER_BANK_NUM;
  stHdl->algDelay = (size_t)AUP_AED_LOOKAHEAD_NFRM;
  stHdl->algCtxtSz = (size_t)AUP_AED_CONTEXT_WINDOW_LEN;
  stHdl->frmRmsBufLen = AUP_AED_MAX(1, stHdl->algDelay);

  return 0;
}
|
|
|
|
|
|
// Copies the run-time tunables from stHdl->dynamCfg into the working state
// and forwards the voiced threshold to the pitch estimator when one exists.
// Returns 0 on success, -1 when stHdl is NULL.
static int AUP_Aed_publishDynamCfg(Aed_St* stHdl) {
  if (stHdl == NULL) {
    return -1;
  }

  stHdl->aivadResetFrmNum = stHdl->dynamCfg.resetFrameNum;
  stHdl->voiceDecideThresh = stHdl->dynamCfg.extVoiceThr;

  if (stHdl->pitchEstStPtr != NULL) {
    PE_DynamCfg pitchCfg;
    pitchCfg.voicedThr = stHdl->dynamCfg.pitchEstVoicedThr;
    AUP_PE_setDynamCfg(stHdl->pitchEstStPtr, &pitchCfg);
  }

  return 0;
}
|
|
|
|
|
|
// Resets all run-time state: clears the dynamic memory block, rebuilds the
// triangular mel filter bank, and re-initialises the pitch estimator, the
// VAD model state and the time-domain analyzer.
//
// Returns 0 on success and -1 when
//   - stHdl is NULL or its dynamic buffers have not been allocated yet,
//   - two neighbouring mel band edges fall on the same FFT bin, or an edge
//     lies beyond the spectrum,
//   - a sub-module fails to initialise.
static int AUP_Aed_resetVariables(Aed_St* stHdl) {
  if (stHdl == NULL) {
    return -1;
  }
  // These buffers only exist after AUP_Aed_memAllocate(); without this guard
  // an early AUP_Aed_init() would write through NULL pointers.
  if (stHdl->dynamMemPtr == NULL || stHdl->melFilterBankCoef == NULL ||
      stHdl->melFilterBinBuff == NULL) {
    return -1;
  }

  memset(stHdl->dynamMemPtr, 0, stHdl->dynamMemSize);

  float* melFbCoef = stHdl->melFilterBankCoef;
  size_t* melBinBuff = stHdl->melFilterBinBuff;
  const size_t nBins = stHdl->intNBins;
  const size_t melFbSz = stHdl->melFbSz;
  size_t i, j;

  stHdl->aedProcFrmCnt = 0;
  stHdl->inputTimeFIFOIdx = 0;
  stHdl->aivadResetCnt = 0;
  stHdl->timeSignalPre = 0.0f;
  stHdl->aivadScore = -1.0f;  // negative score: no decision available yet
  stHdl->aivadScorePre = -1.0f;
  stHdl->pitchFreq = 0.0f;

  // Band edges: melFbSz + 2 points equally spaced on the mel scale between
  // 0 Hz and 8000 Hz, each mapped to an FFT bin index.
  const float low_mel = 2595.0f * log10f(1.0f + 0.0f / 700.0f);
  const float high_mel = 2595.0f * log10f(1.0f + 8000.0f / 700.0f);
  for (i = 0; i < melFbSz + 2; i++) {
    const float mel_points =
        i * (high_mel - low_mel) / ((float)melFbSz + 1.0f) + low_mel;
    const float hz_points = 700.0f * (powf(10.0f, mel_points / 2595.0f) - 1.0f);
    melBinBuff[i] =
        (size_t)((stHdl->intFftSz + 1.0f) * hz_points / (float)AUP_AED_FS);
    if (i > 0 && melBinBuff[i] == melBinBuff[i - 1]) {
      // A zero-width band would divide by zero in the slopes below.
      return -1;
    }
    if (melBinBuff[i] > nBins) {
      // An edge past the spectrum would make the slopes below write outside
      // the band's row of nBins coefficients.
      return -1;
    }
  }

  // One triangle per band: rising from edge j to j+1, falling from j+1 to
  // j+2. Band j occupies row j (nBins coefficients) of melFbCoef.
  for (j = 0; j < melFbSz; j++) {
    const size_t lo = melBinBuff[j];
    const size_t mid = melBinBuff[j + 1];
    const size_t hi = melBinBuff[j + 2];
    float* row = melFbCoef + j * nBins;

    for (i = lo; i < mid; i++) {
      row[i] = (float)(i - lo) / (float)(mid - lo);
    }
    for (i = mid; i < hi; i++) {
      row[i] = (float)(hi - i) / (float)(hi - mid);
    }
  }

  if (stHdl->pitchEstStPtr != NULL) {
    if (AUP_PE_init(stHdl->pitchEstStPtr) < 0) {
      return -1;
    }
  }

  if (stHdl->aivadInf != NULL) {
    stHdl->aivadInf->Reset();
  }

  if (stHdl->timeInAnalysis != NULL) {
    if (AUP_Analyzer_init(stHdl->timeInAnalysis) < 0) {
      return -1;
    }
  }

  return 0;
}
|
|
|
|
|
|
// Returns cnter + 1, wrapping to 0 once the result reaches 1,000,000,000.
static int AUP_Aed_addOneCnter(int cnter) {
  const int next = cnter + 1;
  return (next >= 1000000000) ? 0 : next;
}
|
|
|
|
|
|
// Resamples a power spectrum from srcNBins bins to tgtNBins bins by
// nearest-lower-bin lookup: target bin k reads source bin
// floor(k * (srcNBins - 1) / (tgtNBins - 1)), clamped to the valid range.
//
// Equal sizes are copied verbatim. A single target bin takes source bin 0
// (the ratio above would otherwise divide by zero). NULL pointers or
// non-positive bin counts leave tgt untouched.
static void AUP_Aed_binPowerConvert(const float* src, float* tgt, int srcNBins,
                                    int tgtNBins) {
  if (src == NULL || tgt == NULL || srcNBins <= 0 || tgtNBins <= 0) {
    return;
  }

  if (srcNBins == tgtNBins) {
    memcpy(tgt, src, sizeof(float) * tgtNBins);
    return;
  }

  if (tgtNBins == 1) {
    tgt[0] = src[0];
    return;
  }

  const float rate = (float)(srcNBins - 1) / (float)(tgtNBins - 1);
  for (int tgtIdx = 0; tgtIdx < tgtNBins; tgtIdx++) {
    const int srcIdx =
        std::min(srcNBins - 1, std::max((int)(tgtIdx * rate), 0));
    tgt[tgtIdx] = src[srcIdx];
  }
}
|
|
|
|
|
|
// Converts a packed spectrum into per-bin power.
//
// Layout read from cmplxSpctr: element 0 feeds bin 0 and element 1 feeds the
// last bin (each squared on its own; presumably the purely real DC and
// Nyquist terms), and for every bin k in between elements 2k and 2k+1 are
// combined as re^2 + im^2.
// binPow must hold nBins values.
static void AUP_Aed_CalcBinPow(int nBins, const float* cmplxSpctr,
                               float* binPow) {
  const int lastBin = nBins - 1;

  binPow[0] = cmplxSpctr[0] * cmplxSpctr[0];
  binPow[lastBin] = cmplxSpctr[1] * cmplxSpctr[1];

  for (int bin = 1; bin < lastBin; ++bin) {
    const float* pair = cmplxSpctr + 2 * bin;
    binPow[bin] = pair[0] * pair[0] + pair[1] * pair[1];
  }
}
|
|
|
|
|
|
static int AUP_Aed_pitch_proc(void* pitchModule, const float* timeSignal,
|
|
|
size_t timeLen, const float* binPow, size_t nBins,
|
|
|
PE_OutputData* pOut) {
|
|
|
PE_InputData peInData;
|
|
|
|
|
|
peInData.timeSignal = timeSignal;
|
|
|
peInData.hopSz = (int)timeLen;
|
|
|
peInData.inBinPow = binPow;
|
|
|
peInData.nBins = (int)nBins;
|
|
|
pOut->pitchFreq = 0;
|
|
|
pOut->voiced = -1;
|
|
|
return AUP_PE_proc(pitchModule, &peInData, pOut);
|
|
|
}
|
|
|
|
|
|
// Builds the feature vector for the current frame, appends it to the sliding
// context window and runs the VAD model on the whole window.
//
// Per frame:
//   - features [0, melFbSz): log of the mel-filtered bin power (scaled by
//     1 / 32768^2), normalised with the stored mean / std tables;
//   - features [melFbSz, feaSz): the current pitch frequency, normalised the
//     same way.
//
// aivadScore receives the model output, or -1.0f when no model is attached.
// The model state is reset every aivadResetFrmNum frames.
// Returns 0 on success, -1 on NULL arguments or a failed inference.
static int AUP_Aed_aivad_proc(Aed_St* stHdl, const float* inBinPow,
                              float* aivadScore) {
  if (stHdl == NULL || inBinPow == NULL || aivadScore == NULL) {
    return -1;
  }

  const size_t nBins = stHdl->intNBins;
  const size_t melFbSz = stHdl->melFbSz;
  const float* aivadFeatMean = AUP_AED_FEATURE_MEANS;
  const float* aivadFeatStd = AUP_AED_FEATURE_STDS;
  const float powerNormal = 32768.0f * 32768.0f;
  float* aivadInputFeatStack = stHdl->aivadInputFeatStack;
  float* melFbCoef = stHdl->melFilterBankCoef;
  size_t i, j;

  // Slide the context window one frame towards the start; the new frame is
  // written into the slot freed at the end.
  const size_t srcOffset = stHdl->feaSz;
  const size_t srcLen = (stHdl->algCtxtSz - 1) * stHdl->feaSz;
  memmove(aivadInputFeatStack, aivadInputFeatStack + srcOffset,
          sizeof(float) * srcLen);
  float* curInputFeatPtr = aivadInputFeatStack + srcLen;

  // Log-mel band energies.
  for (i = 0; i < melFbSz; i++) {
    const float* curMelFbCoefPtr = melFbCoef + i * nBins;
    float perBandValue = 0.0f;
    for (j = 0; j < nBins; j++) {
      perBandValue += (inBinPow[j] * curMelFbCoefPtr[j]);
    }
    perBandValue = perBandValue / powerNormal;
    perBandValue = logf(perBandValue + AUP_AED_EPS);
    curInputFeatPtr[i] =
        (perBandValue - aivadFeatMean[i]) / (aivadFeatStd[i] + AUP_AED_EPS);
  }

  // Remaining feature slots carry the pitch estimate.
  for (i = melFbSz; i < stHdl->feaSz; i++) {
    curInputFeatPtr[i] =
        (stHdl->pitchFreq - aivadFeatMean[i]) / (aivadFeatStd[i] + AUP_AED_EPS);
  }

  // Start from the "no decision" sentinel so that a missing model yields a
  // defined score instead of an uninitialised read.
  float aivadOutput = -1.0f;
  if (stHdl->aivadInf != NULL &&
      stHdl->aivadInf->Process(stHdl->aivadInputFeatStack, &aivadOutput) != 0) {
    return -1;
  }
  (*aivadScore) = aivadOutput;

  stHdl->aivadResetCnt += 1;
  if (stHdl->aivadResetCnt >= stHdl->aivadResetFrmNum) {
    if (stHdl->aivadInf != NULL) {
      // Best effort: a failed reset is deliberately not treated as an error.
      (void)stHdl->aivadInf->Reset();
    }
    stHdl->aivadResetCnt = 0;
  }

  return 0;
}
|
|
|
|
|
|
// Computes the layout of the dynamic memory block and, when a block is
// supplied, points the state's buffer members into it.
//
//   memPtrExt == NULL : size query only; returns the number of bytes needed.
//   memPtrExt != NULL : the block must provide at least that many bytes
//                       (memSize); returns the required size on success.
// Returns -1 when stHdl is NULL or the supplied block is too small.
// Every region is rounded up to a multiple of 8 bytes.
static int AUP_Aed_dynamMemPrepare(Aed_St* stHdl, void* memPtrExt,
                                   size_t memSize) {
  if (stHdl == NULL) {
    return -1;
  }

  const size_t pitchInNBins = stHdl->intNBins;

  const size_t timeFifoBytes =
      AUP_AED_ALIGN8(sizeof(float) * stHdl->inputTimeFIFOLen);
  const size_t emphFifoBytes =
      AUP_AED_ALIGN8(sizeof(float) * stHdl->inputTimeFIFOLen);
  const size_t spectrumBytes = AUP_AED_ALIGN8(sizeof(float) * stHdl->intFftSz);
  const size_t binPowBytes = AUP_AED_ALIGN8(sizeof(float) * stHdl->intNBins);
  const size_t featStackBytes =
      AUP_AED_ALIGN8(sizeof(float) * stHdl->algCtxtSz * stHdl->feaSz);
  // Second feature-stack sized region: counted in the total but not assigned
  // to any pointer below.
  const size_t reservedStackBytes =
      AUP_AED_ALIGN8(sizeof(float) * stHdl->algCtxtSz * stHdl->feaSz);
  const size_t melCoefBytes =
      AUP_AED_ALIGN8(sizeof(float) * pitchInNBins * stHdl->feaSz);
  const size_t melBinBytes =
      AUP_AED_ALIGN8(sizeof(size_t) * (stHdl->feaSz + 2));
  const size_t rmsBytes = AUP_AED_ALIGN8(stHdl->frmRmsBufLen * sizeof(float));
  const size_t floatBufBytes =
      AUP_AED_ALIGN8(stHdl->extHopSz * sizeof(float));

  size_t totalMemSize = 0;
  totalMemSize += timeFifoBytes;
  totalMemSize += emphFifoBytes;
  totalMemSize += spectrumBytes;
  totalMemSize += binPowBytes;
  totalMemSize += featStackBytes;
  totalMemSize += reservedStackBytes;
  totalMemSize += melCoefBytes;
  totalMemSize += melBinBytes;
  totalMemSize += rmsBytes;
  totalMemSize += floatBufBytes;

  if (memPtrExt == NULL) {
    return ((int)totalMemSize);
  }
  if (totalMemSize > memSize) {
    return -1;
  }

  // Hand out consecutive regions, advancing a byte cursor through the block.
  char* const base = (char*)memPtrExt;
  char* cursor = base;

  stHdl->inputTimeFIFO = (float*)cursor;
  cursor += timeFifoBytes;

  stHdl->inputEmphTimeFIFO = (float*)cursor;
  cursor += emphFifoBytes;

  stHdl->aivadInputCmplxSptrm = (float*)cursor;
  cursor += spectrumBytes;

  stHdl->aivadInputBinPow = (float*)cursor;
  cursor += binPowBytes;

  stHdl->aivadInputFeatStack = (float*)cursor;
  cursor += featStackBytes;

  stHdl->melFilterBankCoef = (float*)cursor;
  cursor += melCoefBytes;

  stHdl->melFilterBinBuff = (size_t*)cursor;
  cursor += melBinBytes;

  stHdl->frameRmsBuff = (float*)cursor;
  cursor += rmsBytes;

  stHdl->inputFloatBuff = (float*)cursor;
  cursor += floatBufBytes;

  // Sanity check: the regions handed out must fit in the computed total.
  if (((size_t)(cursor - base)) > totalMemSize) {
    return -1;
  }

  return ((int)totalMemSize);
}
|
|
|
|
|
|
// Processes one internal frame: estimates the pitch, then runs the VAD model,
// storing both results in stHdl (pitchFreq, aivadScore).
// Returns 0 on success, -1 as soon as either stage fails; on failure the
// field belonging to the failed stage keeps its previous value.
static int AUP_Aed_runOneFrm(Aed_St* stHdl, const float* tSignal, int hopSz,
                             const float* binPowPtr, int nBins) {
  PE_OutputData peOutData = {0, 0};
  float aivadScore = -1.0f;

  if (AUP_Aed_pitch_proc(stHdl->pitchEstStPtr, tSignal, hopSz, binPowPtr, nBins,
                         &peOutData) < 0) {
    return -1;
  }
  // The pitch must be stored before the VAD stage runs: it is one of the
  // model's input features.
  stHdl->pitchFreq = peOutData.pitchFreq;

  if (AUP_Aed_aivad_proc(stHdl, binPowPtr, &aivadScore) < 0) {
    return -1;
  }
  stHdl->aivadScore = aivadScore;

  return 0;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int AUP_Aed_create(void** stPtr) {
|
|
|
if (stPtr == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
Aed_St* tmpPtr = (Aed_St*)malloc(sizeof(Aed_St));
|
|
|
if (tmpPtr == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
memset(tmpPtr, 0, sizeof(Aed_St));
|
|
|
|
|
|
if (AUP_PE_create(&(tmpPtr->pitchEstStPtr)) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
if (AUP_Analyzer_create(&(tmpPtr->timeInAnalysis)) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
tmpPtr->stCfg.enableFlag = 1;
|
|
|
tmpPtr->stCfg.fftSz = 1024;
|
|
|
tmpPtr->stCfg.hopSz = 256;
|
|
|
tmpPtr->stCfg.anaWindowSz = 768;
|
|
|
tmpPtr->stCfg.frqInputAvailableFlag = 0;
|
|
|
|
|
|
tmpPtr->dynamCfg.extVoiceThr = 0.5f;
|
|
|
tmpPtr->dynamCfg.extMusicThr = 0.5f;
|
|
|
tmpPtr->dynamCfg.extEnergyThr = 10.0f;
|
|
|
tmpPtr->dynamCfg.resetFrameNum = 1875;
|
|
|
tmpPtr->dynamCfg.pitchEstVoicedThr = AUP_AED_PITCH_EST_DEFAULT_VOICEDTHR;
|
|
|
|
|
|
(*stPtr) = (void*)tmpPtr;
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_destroy(void** stPtr) {
|
|
|
if (stPtr == NULL || (*stPtr) == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
Aed_St* stHdl = (Aed_St*)(*stPtr);
|
|
|
|
|
|
if (stHdl->aivadInf != NULL) {
|
|
|
delete stHdl->aivadInf;
|
|
|
}
|
|
|
stHdl->aivadInf = NULL;
|
|
|
|
|
|
if (AUP_PE_destroy(&(stHdl->pitchEstStPtr)) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
if (AUP_Analyzer_destroy(&(stHdl->timeInAnalysis)) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
if (stHdl->dynamMemPtr != NULL) {
|
|
|
free(stHdl->dynamMemPtr);
|
|
|
}
|
|
|
stHdl->dynamMemPtr = NULL;
|
|
|
|
|
|
if (stHdl != NULL) {
|
|
|
free(stHdl);
|
|
|
}
|
|
|
(*stPtr) = NULL;
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_memAllocate(void* stPtr, const Aed_StaticCfg* pCfg) {
|
|
|
Aed_St* stHdl = (Aed_St*)(stPtr);
|
|
|
Aed_StaticCfg aedStatCfg;
|
|
|
PE_StaticCfg pitchStatCfg;
|
|
|
Analyzer_StaticCfg analyzerStatCfg;
|
|
|
int totalMemSize = 0;
|
|
|
|
|
|
if (stPtr == NULL || pCfg == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
memcpy(&aedStatCfg, pCfg, sizeof(Aed_StaticCfg));
|
|
|
if (AUP_Aed_checkStatCfg(&aedStatCfg) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
memcpy(&(stHdl->stCfg), &aedStatCfg, sizeof(Aed_StaticCfg));
|
|
|
|
|
|
|
|
|
|
|
|
if (AUP_Aed_publishStaticCfg(stHdl) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
if (stHdl->aivadInf == NULL) {
|
|
|
stHdl->aivadInf = new AUP_MODULE_AIVAD("onnx_model/ten-vad.onnx");
|
|
|
if (stHdl->aivadInf == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
}
|
|
|
stHdl->aivadInf->Reset();
|
|
|
|
|
|
|
|
|
if (AUP_PE_getStaticCfg(stHdl->pitchEstStPtr, &pitchStatCfg) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
pitchStatCfg.fftSz = stHdl->intFftSz;
|
|
|
pitchStatCfg.anaWindowSz = stHdl->intWinSz;
|
|
|
pitchStatCfg.hopSz = stHdl->intHopSz;
|
|
|
pitchStatCfg.useLPCPreFiltering = AUP_AED_PITCH_EST_USE_LPC;
|
|
|
pitchStatCfg.procFs = AUP_AED_PITCH_EST_PROCFS;
|
|
|
if (AUP_PE_memAllocate(stHdl->pitchEstStPtr, &pitchStatCfg) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
AUP_Analyzer_getStaticCfg(stHdl->timeInAnalysis, &analyzerStatCfg);
|
|
|
analyzerStatCfg.win_len = (int)stHdl->intWinSz;
|
|
|
analyzerStatCfg.hop_size = (int)stHdl->intHopSz;
|
|
|
analyzerStatCfg.fft_size = (int)stHdl->intFftSz;
|
|
|
analyzerStatCfg.ana_win_coeff = stHdl->intAnalyWindowPtr;
|
|
|
if (AUP_Analyzer_memAllocate(stHdl->timeInAnalysis, &analyzerStatCfg) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
totalMemSize = AUP_Aed_dynamMemPrepare(stHdl, NULL, 0);
|
|
|
if (totalMemSize < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
if (totalMemSize > (int)stHdl->dynamMemSize) {
|
|
|
if (stHdl->dynamMemPtr != NULL) {
|
|
|
free(stHdl->dynamMemPtr);
|
|
|
stHdl->dynamMemPtr = NULL;
|
|
|
stHdl->dynamMemSize = 0;
|
|
|
}
|
|
|
stHdl->dynamMemPtr = malloc(totalMemSize);
|
|
|
if (stHdl->dynamMemPtr == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
stHdl->dynamMemSize = totalMemSize;
|
|
|
}
|
|
|
memset(stHdl->dynamMemPtr, 0, stHdl->dynamMemSize);
|
|
|
|
|
|
|
|
|
if (AUP_Aed_dynamMemPrepare(stHdl, stHdl->dynamMemPtr, stHdl->dynamMemSize) <
|
|
|
0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
if (AUP_Aed_publishDynamCfg(stHdl) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_init(void* stPtr) {
|
|
|
Aed_St* stHdl = (Aed_St*)(stPtr);
|
|
|
if (stPtr == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
if (AUP_Aed_publishDynamCfg(stHdl) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
if (AUP_Aed_resetVariables(stHdl) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_setDynamCfg(void* stPtr, const Aed_DynamCfg* pCfg) {
|
|
|
Aed_St* stHdl = (Aed_St*)(stPtr);
|
|
|
|
|
|
if (stPtr == NULL || pCfg == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
memcpy(&(stHdl->dynamCfg), pCfg, sizeof(Aed_DynamCfg));
|
|
|
|
|
|
|
|
|
if (AUP_Aed_publishDynamCfg(stHdl) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_getStaticCfg(const void* stPtr, Aed_StaticCfg* pCfg) {
|
|
|
const Aed_St* stHdl = (const Aed_St*)(stPtr);
|
|
|
|
|
|
if (stPtr == NULL || pCfg == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
memcpy(pCfg, &(stHdl->stCfg), sizeof(Aed_StaticCfg));
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_getDynamCfg(const void* stPtr, Aed_DynamCfg* pCfg) {
|
|
|
const Aed_St* stHdl = (const Aed_St*)(stPtr);
|
|
|
|
|
|
if (stPtr == NULL || pCfg == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
memcpy(pCfg, &(stHdl->dynamCfg), sizeof(Aed_DynamCfg));
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_getAlgDelay(const void* stPtr, int* delayInFrms) {
|
|
|
const Aed_St* stHdl = (const Aed_St*)(stPtr);
|
|
|
|
|
|
if (stPtr == NULL || delayInFrms == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
(*delayInFrms) = (int)stHdl->algDelay;
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
int AUP_Aed_proc(void* stPtr, const Aed_InputData* pIn, Aed_OutputData* pOut) {
|
|
|
Analyzer_InputData analyzerInput;
|
|
|
Analyzer_OutputData analyzerOutput;
|
|
|
Aed_St* stHdl = (Aed_St*)(stPtr);
|
|
|
|
|
|
const float* binPowPtr = NULL;
|
|
|
float frameRms = 0.0f;
|
|
|
float frameEnergy = 0.0f;
|
|
|
float powerNormal = 32768.0f * 32768.0f;
|
|
|
int idx;
|
|
|
|
|
|
if (stPtr == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
if (stHdl->stCfg.enableFlag == 0) {
|
|
|
return 0;
|
|
|
}
|
|
|
if (pIn == NULL || pIn->timeSignal == NULL || pOut == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
if (stHdl->intAnalyFlag != 2) {
|
|
|
if (pIn->binPower == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
if (pIn->nBins != (int)((stHdl->stCfg.fftSz >> 1) + 1) ||
|
|
|
pIn->hopSz != (int)(stHdl->stCfg.hopSz)) {
|
|
|
return -1;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
for (idx = 0; idx < pIn->hopSz; idx++) {
|
|
|
frameRms += (pIn->timeSignal[idx] * pIn->timeSignal[idx]);
|
|
|
}
|
|
|
frameEnergy = frameRms;
|
|
|
frameRms = sqrtf(frameRms / (float)pIn->hopSz);
|
|
|
memmove(stHdl->frameRmsBuff, stHdl->frameRmsBuff + 1,
|
|
|
sizeof(float) * (stHdl->frmRmsBufLen - 1));
|
|
|
stHdl->frameRmsBuff[stHdl->frmRmsBufLen - 1] = frameRms;
|
|
|
|
|
|
|
|
|
if ((stHdl->inputTimeFIFOIdx + pIn->hopSz) > (int)stHdl->inputTimeFIFOLen) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
float* timeSigEphaPtr = stHdl->inputEmphTimeFIFO + stHdl->inputTimeFIFOIdx;
|
|
|
for (idx = 0; idx < pIn->hopSz; idx++) {
|
|
|
timeSigEphaPtr[idx] = pIn->timeSignal[idx] - 0.97f * stHdl->timeSignalPre;
|
|
|
stHdl->timeSignalPre = pIn->timeSignal[idx];
|
|
|
}
|
|
|
|
|
|
memcpy(stHdl->inputTimeFIFO + stHdl->inputTimeFIFOIdx, pIn->timeSignal,
|
|
|
sizeof(float) * (pIn->hopSz));
|
|
|
stHdl->inputTimeFIFOIdx += pIn->hopSz;
|
|
|
|
|
|
if (stHdl->intAnalyFlag == 0) {
|
|
|
if (stHdl->inputTimeFIFOIdx != (int)(stHdl->intHopSz) ||
|
|
|
(int)(stHdl->intNBins) != pIn->nBins) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
stHdl->aedProcFrmCnt = AUP_Aed_addOneCnter(stHdl->aedProcFrmCnt);
|
|
|
binPowPtr = pIn->binPower;
|
|
|
|
|
|
|
|
|
if (AUP_Aed_runOneFrm(stHdl, stHdl->inputTimeFIFO, (int)stHdl->intHopSz,
|
|
|
binPowPtr, (int)stHdl->intNBins) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
stHdl->inputTimeFIFOIdx = 0;
|
|
|
} else if (stHdl->intAnalyFlag ==
|
|
|
1) {
|
|
|
if (stHdl->inputTimeFIFOIdx != (int)(stHdl->intHopSz) ||
|
|
|
(int)(stHdl->extNBins) != pIn->nBins) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
stHdl->aedProcFrmCnt = AUP_Aed_addOneCnter(stHdl->aedProcFrmCnt);
|
|
|
AUP_Aed_binPowerConvert(pIn->binPower, stHdl->aivadInputBinPow,
|
|
|
(int)stHdl->extNBins, (int)stHdl->intNBins);
|
|
|
binPowPtr = stHdl->aivadInputBinPow;
|
|
|
|
|
|
|
|
|
if (AUP_Aed_runOneFrm(stHdl, stHdl->inputTimeFIFO, (int)stHdl->intHopSz,
|
|
|
binPowPtr, (int)stHdl->intNBins) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
stHdl->inputTimeFIFOIdx = 0;
|
|
|
} else {
|
|
|
if (stHdl->timeInAnalysis == NULL) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
while (stHdl->inputTimeFIFOIdx >= (int)stHdl->intHopSz) {
|
|
|
stHdl->aedProcFrmCnt = AUP_Aed_addOneCnter(stHdl->aedProcFrmCnt);
|
|
|
|
|
|
analyzerInput.input = stHdl->inputEmphTimeFIFO;
|
|
|
analyzerInput.iLength = (int)stHdl->intHopSz;
|
|
|
analyzerOutput.output = stHdl->aivadInputCmplxSptrm;
|
|
|
analyzerOutput.oLength = (int)stHdl->intFftSz;
|
|
|
if (AUP_Analyzer_proc(stHdl->timeInAnalysis, &analyzerInput,
|
|
|
&analyzerOutput) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
AUP_Aed_CalcBinPow((int)stHdl->intNBins, stHdl->aivadInputCmplxSptrm,
|
|
|
stHdl->aivadInputBinPow);
|
|
|
binPowPtr = stHdl->aivadInputBinPow;
|
|
|
|
|
|
|
|
|
if (AUP_Aed_runOneFrm(stHdl, stHdl->inputTimeFIFO, (int)stHdl->intHopSz,
|
|
|
binPowPtr, (int)stHdl->intNBins) < 0) {
|
|
|
return -1;
|
|
|
}
|
|
|
|
|
|
|
|
|
if (stHdl->inputTimeFIFOIdx > (int)stHdl->intHopSz) {
|
|
|
memcpy(stHdl->inputTimeFIFO, stHdl->inputTimeFIFO + stHdl->intHopSz,
|
|
|
sizeof(float) * (stHdl->inputTimeFIFOIdx - stHdl->intHopSz));
|
|
|
memcpy(stHdl->inputEmphTimeFIFO,
|
|
|
stHdl->inputEmphTimeFIFO + stHdl->intHopSz,
|
|
|
sizeof(float) * (stHdl->inputTimeFIFOIdx - stHdl->intHopSz));
|
|
|
}
|
|
|
stHdl->inputTimeFIFOIdx -= (int)stHdl->intHopSz;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
pOut->frameEnergy = frameEnergy / powerNormal;
|
|
|
pOut->frameRms = stHdl->frameRmsBuff[0];
|
|
|
pOut->pitchFreq = stHdl->pitchFreq;
|
|
|
pOut->voiceProb = stHdl->aivadScore;
|
|
|
if (pOut->voiceProb < 0.0f) {
|
|
|
pOut->vadRes = -1;
|
|
|
} else if (pOut->voiceProb <= stHdl->voiceDecideThresh) {
|
|
|
pOut->vadRes = 0;
|
|
|
} else {
|
|
|
pOut->vadRes = 1;
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|