File size: 4,836 Bytes
b50f2a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
//
// Copyright © 2025 Agora
// This file is part of TEN Framework, an open source project.
// Licensed under the Apache License, Version 2.0, with certain conditions.
// Refer to the "LICENSE" file in the root directory for more information.
//
#ifndef __AED_ST_H__
#define __AED_ST_H__

#include <stdio.h>
#include <onnxruntime_c_api.h>

#include "aed.h"

// ---- General AED constants -------------------------------------------------
// NOTE(review): AUP_AED_FS is presumably the processing sample rate in Hz and
// AUP_AED_MAX_IN_BUFF_SIZE the maximum input buffer size in samples -- confirm
// against the implementation file.
#define AUP_AED_FS (16000)
#define AUP_AED_MAX_IN_BUFF_SIZE (256)
#define AUP_AED_POWER_SPCTR_NORMALIZER (9.3132e-10f)  // = 1/(32768^2)
#define AUP_AED_OUTPUT_SMOOTH_FILTER_LEN (10)         // 160ms

// ---- AIVAD feature layout --------------------------------------------------
#define AUP_AED_MEL_FILTER_BANK_NUM (40)
#define AUP_AED_LOOKAHEAD_NFRM (1)
#define AUP_AED_CONTEXT_WINDOW_LEN (3)  // context window length of AIVAD
// One feature vector holds the mel filter-bank outputs plus one extra element.
#define AUP_AED_FEA_LEN \
  (AUP_AED_MEL_FILTER_BANK_NUM + 1)  // feature length of AIVAD

// ---- Pitch estimation ------------------------------------------------------
// The default voiced threshold depends on AUP_AED_PITCH_EST_PROCFS; with the
// value 4000 defined here the #else branch (0.4f) is the one that is selected.
#define AUP_AED_PITCH_EST_USE_LPC (1)
#define AUP_AED_PITCH_EST_PROCFS (4000)
#if AUP_AED_PITCH_EST_PROCFS == 2000
#define AUP_AED_PITCH_EST_DEFAULT_VOICEDTHR (0.45f)
#else
#define AUP_AED_PITCH_EST_DEFAULT_VOICEDTHR (0.4f)
#endif

// ---- AIVAD model I/O -------------------------------------------------------
// These constants size the fixed buffers declared in AUP_MODULE_AIVAD:
//   AUP_AED_MODEL_IO_NUM      - number of entries in the input/output name and
//                               tensor arrays
//   AUP_AED_MODEL_NAME_LENGTH - bytes reserved for each input/output name
//   AUP_AED_MODEL_HIDDEN_DIM  - floats per row of input_data_buf_1234
#define AUP_AED_MODEL_IO_NUM (5)
#define AUP_AED_MODEL_NAME_LENGTH (32)
#define AUP_AED_MODEL_HIDDEN_DIM (64)

// Wrapper around the ONNX Runtime objects and fixed-size I/O buffers used to
// run the AIVAD model. The member functions are only declared here; their
// definitions live in another translation unit.
class AUP_MODULE_AIVAD {
 public:
  // onnx_path: NOTE(review) presumably the path of the ONNX model file to
  // load -- confirm against the constructor definition.
  AUP_MODULE_AIVAD(char* onnx_path);
  ~AUP_MODULE_AIVAD();

  // Non-copyable. The class declares a destructor and stores raw ONNX Runtime
  // handles as well as `const char*` tables next to name buffers in the same
  // object; an implicit member-wise copy would duplicate those raw pointers,
  // so two instances could end up referring to (and releasing) the same
  // resources. Instances are expected to be handled through pointers.
  AUP_MODULE_AIVAD(const AUP_MODULE_AIVAD&) = delete;
  AUP_MODULE_AIVAD& operator=(const AUP_MODULE_AIVAD&) = delete;

  // Runs one processing step reading from `input` and writing to `output`.
  // NOTE(review): expected buffer lengths and the meaning of the returned int
  // are not visible in this header -- confirm against the definition.
  int Process(float* input, float* output);

  // NOTE(review): presumably resets the internal model state; the meaning of
  // the returned int is not visible in this header.
  int Reset();

 private:
  // ONNX Runtime handles; all start out as NULL.
  const OrtApi* ort_api = NULL;
  OrtAllocator* ort_allocator = NULL;
  OrtEnv* ort_env = NULL;
  OrtSession* ort_session = NULL;
  int inited = 0;
  int clear_hidden = 0;

  // Model inputs: name storage, name pointer table, data buffers and tensors.
  char input_names_buf[AUP_AED_MODEL_IO_NUM][AUP_AED_MODEL_NAME_LENGTH] = {0};
  const char* input_names[AUP_AED_MODEL_IO_NUM] = {NULL};
  // Data buffer sized for AUP_AED_CONTEXT_WINDOW_LEN feature vectors of
  // AUP_AED_FEA_LEN floats each.
  float input_data_buf_0[AUP_AED_CONTEXT_WINDOW_LEN * AUP_AED_FEA_LEN] = {0};
  // Data buffers for the remaining (AUP_AED_MODEL_IO_NUM - 1) inputs, each
  // AUP_AED_MODEL_HIDDEN_DIM floats long.
  float input_data_buf_1234[AUP_AED_MODEL_IO_NUM - 1]
                           [AUP_AED_MODEL_HIDDEN_DIM] = {0};
  OrtValue* ort_input_tensors[AUP_AED_MODEL_IO_NUM] = {NULL};

  // Model outputs: name storage, name pointer table and tensors.
  char output_names_buf[AUP_AED_MODEL_IO_NUM][AUP_AED_MODEL_NAME_LENGTH] = {0};
  const char* output_names[AUP_AED_MODEL_IO_NUM] = {NULL};
  OrtValue* ort_output_tensors[AUP_AED_MODEL_IO_NUM] = {NULL};
};

// State of one AED instance: user configuration, registers derived from that
// configuration, sub-module handles and the working buffers/variables.
// Bracketed annotations such as [intNBins] give the element count of the
// buffer a pointer refers to.
typedef struct Aed_St_ {
  void* dynamMemPtr;    // memory pointer holding the dynamic memory
  size_t dynamMemSize;  // size of the buffer *dynamMemPtr

  // Static configuration
  Aed_StaticCfg stCfg;

  // Dynamic configuration
  Aed_DynamCfg dynamCfg;

  // Internal Static Config Registers, which are generated from stCfg
  size_t extFftSz;  // externally decided FFT-Sz
  size_t extHopSz;  // externally decided FFT-Hop-Sz
  size_t extNBins;  // (FFTSz/2) + 1
  size_t extWinSz;  // externally decided FFT-Window-Sz

  size_t intFftSz;                 // internal FFT Sz
  size_t intHopSz;                 // internal Hop Sz
  size_t intWinSz;                 // internal Window Sz
  size_t intNBins;                 // internal NBins
  const float* intAnalyWindowPtr;  // internal analysis window pointer
  int intAnalyFlag;                // whether to do internal analysis
  // 0: directly use external spectrum
  // 1: use external spectrum with interpolation / extrapolation
  // 2: need to redo analysis based on input time-domain signal
  size_t inputTimeFIFOLen;  // length of input FIFO buffer
  // if = 0: no need for input time-domain FIFO Queue

  // Internal static config registers for pitch-est module
  size_t feaSz;
  size_t melFbSz;
  size_t algDelay;  // in terms of processing frames
  size_t algCtxtSz;
  size_t frmRmsBufLen;  // frameRmsBuff: buffer-length of frameRmsBuff (FIFO)

  // Internal dynamic Config Registers, which are generated from dynamCfg
  size_t aivadResetFrmNum;
  float voiceDecideThresh;

  // SubModules
  AUP_MODULE_AIVAD* aivadInf;  // AIVAD model inference object

  void* pitchEstStPtr;  // pitch-estimation module handler
  void* timeInAnalysis;
  // state handler of STFT analysis module

  // Variables
  int aedProcFrmCnt;  // counter of consecutive AI-VAD processed frames
  int inputTimeFIFOIdx;
  float* inputTimeFIFO;  // [inputTimeFIFOLen]
  // input fifo buffer of time-signal to adjust between extHopSz and intHopSz
  float* inputEmphTimeFIFO;     // [inputTimeFIFOLen]
  float* aivadInputCmplxSptrm;  // [intFftSz]
  float* aivadInputBinPow;      // [intNBins]  // AIVAD input power spectrum
  size_t aivadResetCnt;
  float timeSignalPre;
  float aivadScore;
  float aivadScorePre;

  float pitchFreq;      // input audio pitch in Hz
  float* frameRmsBuff;  // [frmRmsBufLen], FIFO, to delay frmRms result so that
                        // it aligns with AIVAD result
  float* aivadInputFeatStack;  // [...] = [AUP_AED_CONTEXT_WINDOW_LEN *
                               // AUP_AED_FEA_LEN]
  float* melFilterBankCoef;    // [melFbSz][nBins]
  size_t* melFilterBinBuff;    // [melFbSz + 2]
  float* inputFloatBuff;       // [hopSz]
} Aed_St;

#endif