Upload 6 files
- MicrophoneAudio.js +129 -0
- Silero.js +165 -0
- SpeechChunks.js +144 -0
- VoiceActivityDetector.js +129 -0
- index.html +85 -19
- silero_vad.onnx +3 -0
MicrophoneAudio.js
ADDED
@@ -0,0 +1,129 @@
interface MicrophoneAudioOptions {
    sampleRate?: number;
    channels?: number;
    windowSizeSamples: number;
    onAudioData: (audioData: Float32Array) => void;
}

class MicrophoneAudio {
    private stream: MediaStream | null = null;
    private audioContext: AudioContext | null = null;
    private sourceNode: MediaStreamAudioSourceNode | null = null;
    private workletNode: AudioWorkletNode | null = null;
    private options: MicrophoneAudioOptions;
    private buffer: Float32Array = new Float32Array();

    constructor(options: MicrophoneAudioOptions) {
        console.log('Initializing MicrophoneAudio');
        this.options = {
            sampleRate: 16000,
            channels: 1,
            ...options,
        };
        console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
    }

    getDeviceId(): Promise<string> {
        console.log('Getting device ID');
        return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
            const deviceId = stream.getTracks()[0].getSettings().deviceId;
            console.log("The device Id is", deviceId);
            // Stop the probe stream so it does not keep the mic open, and fall
            // back to '' since getSettings().deviceId is optional.
            stream.getTracks().forEach((track) => track.stop());
            return deviceId ?? '';
        });
    }

    async start(): Promise<void> {
        console.log('Starting MicrophoneAudio');
        try {
            this.stream = await navigator.mediaDevices.getUserMedia({
                audio: {
                    sampleRate: this.options.sampleRate,
                    channelCount: this.options.channels,
                },
            });
            console.log('MediaStream acquired');

            this.getDeviceId().then((deviceId) => {
                console.log("The device Id is", deviceId);
            });
            this.audioContext = new AudioContext({
                sampleRate: this.options.sampleRate,
            });

            // The worklet source is inlined as a Blob URL so no separate file has
            // to be served; it buffers input and posts fixed-size windows back.
            await this.audioContext.audioWorklet.addModule(
                URL.createObjectURL(new Blob([`
                    class AudioProcessor extends AudioWorkletProcessor {
                        constructor() {
                            super();
                            this.buffer = new Float32Array();
                        }

                        process(inputs, outputs, parameters) {
                            const input = inputs[0];
                            const channelData = input[0];

                            this.buffer = Float32Array.from([...this.buffer, ...channelData]);

                            while (this.buffer.length >= ${this.options.windowSizeSamples}) {
                                const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
                                this.port.postMessage(chunk);
                                this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
                            }

                            return true;
                        }
                    }

                    registerProcessor('audio-processor', AudioProcessor);
                `], { type: 'application/javascript' }))
            );

            this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
            this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

            this.workletNode.port.onmessage = (event) => {
                this.options.onAudioData(event.data);
            };

            this.sourceNode.connect(this.workletNode);
            this.workletNode.connect(this.audioContext.destination);
            console.log('AudioWorklet added and connected');
        } catch (error) {
            console.error('Error starting microphone:', error);
            throw error;
        }
    }

    stop(): void {
        console.log('Stopping MicrophoneAudio');
        if (this.workletNode) {
            // Note: the inlined processor registers no message handler, so this
            // 'flush' is currently a no-op on the worklet side.
            this.workletNode.port.postMessage('flush');
            this.workletNode.disconnect();
            this.workletNode = null;
        }

        if (this.sourceNode) {
            this.sourceNode.disconnect();
            this.sourceNode = null;
        }

        if (this.audioContext) {
            this.audioContext.close();
            this.audioContext = null;
        }

        if (this.stream) {
            this.stream.getTracks().forEach((track) => track.stop());
            this.stream = null;
        }

        // Send any remaining data in the buffer
        if (this.buffer.length > 0) {
            this.options.onAudioData(this.buffer);
            this.buffer = new Float32Array();
        }
        console.log('MicrophoneAudio stopped');
    }
}

export default MicrophoneAudio;
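For orientation, a minimal consumer sketch (not part of the commit): it assumes a bundler resolves the module path, and the #start/#stop element ids are hypothetical. The 512-sample window matches the frame size the Silero model expects at 16 kHz, as SpeechChunks.js uses below.

import MicrophoneAudio from './MicrophoneAudio';

// Hypothetical consumer: log each fixed-size window emitted by the worklet.
const mic = new MicrophoneAudio({
    windowSizeSamples: 512, // matches the 512-sample frames SpeechChunks feeds the VAD
    onAudioData: (frame) => console.log('captured', frame.length, 'samples'),
});

// getUserMedia requires a secure context and, in practice, a user gesture.
document.querySelector('#start')?.addEventListener('click', () => mic.start());
document.querySelector('#stop')?.addEventListener('click', () => mic.stop());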
Silero.js
ADDED
@@ -0,0 +1,165 @@
import * as ort from 'onnxruntime-web';

class OnnxWrapper {
    private session: ort.InferenceSession;
    private _state: number[][];
    private _context: number[];
    private _last_sr: number;
    private _last_batch_size: number;
    private sample_rates: number[];
    private sessionReady: Promise<void>;

    constructor(path: string, force_onnx_cpu: boolean = true) {
        console.log(`Initializing OnnxWrapper with path: ${path}`);
        this.sessionReady = this.initSession(path, force_onnx_cpu);
        this.resetStates();
        this.sample_rates = [8000, 16000];
    }

    async ready(): Promise<void> {
        console.log('Waiting for OnnxWrapper session to be ready');
        await this.sessionReady;
        console.log('OnnxWrapper session is ready');
    }

    private async initSession(path: string, force_onnx_cpu: boolean) {
        console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
        const options: ort.InferenceSession.SessionOptions = {
            executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
            graphOptimizationLevel: 'all',
            executionMode: 'sequential',
            enableCpuMemArena: true,
            enableMemPattern: true,
            extra: {
                session: {
                    intra_op_num_threads: 1,
                    inter_op_num_threads: 1,
                }
            }
        };

        this.session = await ort.InferenceSession.create(path, options);
        console.log('ONNX session created successfully');
    }

    private _validate_input(x: number[][], sr: number): [number[][], number] {
        if (!Array.isArray(x[0])) {
            x = [x as unknown as number[]];
        }
        if (x.length > 2) {
            throw new Error(`Too many dimensions for input audio chunk ${x.length}`);
        }
        if (sr !== 16000 && (sr % 16000 === 0)) {
            // Naive decimation to 16 kHz: keep every (sr / 16000)-th sample.
            const step = Math.floor(sr / 16000);
            x = x.map(row => row.filter((_, i) => i % step === 0));
            sr = 16000;
        }
        if (!this.sample_rates.includes(sr)) {
            throw new Error(`Supported sampling rates: ${this.sample_rates} (or multiple of 16000)`);
        }
        if (sr / x[0].length > 31.25) {
            throw new Error("Input audio chunk is too short");
        }
        return [x, sr];
    }

    resetStates(batch_size: number = 1): void {
        console.log(`Resetting states with batch_size: ${batch_size}`);
        this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
        this._context = [];
        this._last_sr = 0;
        this._last_batch_size = 0;
    }

    async call(x: number[][], sr: number): Promise<number[][]> {
        console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
        await this.ready();
        [x, sr] = this._validate_input(x, sr);
        const num_samples = sr === 16000 ? 512 : 256;

        if (x[0].length !== num_samples) {
            throw new Error(`Provided number of samples is ${x[0].length} (Supported values: 256 for 8000 sample rate, 512 for 16000)`);
        }

        const batch_size = x.length;
        const context_size = sr === 16000 ? 64 : 32;

        if (!this._last_batch_size) {
            this.resetStates(batch_size);
        }
        if (this._last_sr && this._last_sr !== sr) {
            this.resetStates(batch_size);
        }
        if (this._last_batch_size && this._last_batch_size !== batch_size) {
            this.resetStates(batch_size);
        }
        if (this._context.length === 0) {
            this._context = Array(batch_size * context_size).fill(0);
        }

        // Prepend the trailing context from the previous window to each row.
        x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);

        if (sr === 8000 || sr === 16000) {
            const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
            const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
            // Note: some onnxruntime-web versions require BigInt64Array data for int64 tensors.
            const srTensor = new ort.Tensor('int64', [sr], []);

            const feeds: Record<string, ort.Tensor> = {
                input: inputTensor,
                state: stateTensor,
                sr: srTensor
            };

            const results = await this.session.run(feeds);
            const outputData = results.output.data as Float32Array;
            const stateData = results.stateN.data as Float32Array;

            this._state = Array(2).fill(0).map((_, i) =>
                Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
            );

            const outputShape = results.output.dims as number[];
            const out = Array(outputShape[0]).fill(0).map((_, i) =>
                Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
            );

            this._context = x.map(row => row.slice(-context_size)).flat();
            this._last_sr = sr;
            this._last_batch_size = batch_size;

            console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
            return out;
        } else {
            throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
        }
    }

    async audio_forward(x: number[][], sr: number): Promise<number[][]> {
        console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
        const outs: number[][][] = [];
        [x, sr] = this._validate_input(x, sr);
        this.resetStates();
        const num_samples = sr === 16000 ? 512 : 256;

        if (x[0].length % num_samples !== 0) {
            const pad_num = num_samples - (x[0].length % num_samples);
            x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
        }

        for (let i = 0; i < x[0].length; i += num_samples) {
            const wavs_batch = x.map(row => row.slice(i, i + num_samples));
            const out_chunk = await this.call(wavs_batch, sr);
            outs.push(out_chunk);
        }

        console.log(`audio_forward completed, output shape: [${outs.length}, ${outs[0].length}]`);
        return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
    }

    close(): void {
        console.log('Closing OnnxWrapper session');
        this.session.release();
    }
}

export default OnnxWrapper;
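A rough smoke-test sketch of the wrapper, run as an ES module (not part of the commit; the served model path is an assumption, and as the comment above notes, int64 tensor construction may need BigInt64Array on some onnxruntime-web versions). At 16 kHz the model scores one 512-sample frame per call and returns a [batch, 1] array of speech probabilities.

import OnnxWrapper from './Silero';

// Hypothetical check: score a single 512-sample frame of silence.
const vad = new OnnxWrapper('/silero_vad.onnx'); // path is an assumption
await vad.ready();
const frame = [Array(512).fill(0)];              // shape [1, 512]
const probs = await vad.call(frame, 16000);      // shape [1, 1], values in [0, 1]
console.log('speech probability:', probs[0][0]); // ~0 for silence
vad.close();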
SpeechChunks.js
ADDED
@@ -0,0 +1,144 @@
import MicrophoneAudio from './MicrophoneAudio';
import { VadDetector } from './VoiceActivityDetector';

export class SpeechChunks {
    private static readonly SAMPLE_RATE = 16000;
    private static readonly START_THRESHOLD = 0.6;
    private static readonly END_THRESHOLD = 0.45;
    private static readonly MIN_SILENCE_DURATION_MS = 600;
    private static readonly SPEECH_PAD_MS = 500;
    private static readonly WINDOW_SIZE_SAMPLES = 512;

    private chunks: number[][];
    private microphoneAudio: MicrophoneAudio;
    private vadDetector: VadDetector;
    private isSpeechActive: boolean;
    private onSpeechStart: () => void;
    private onSpeechEnd: (blob: Blob) => void;

    constructor(onSpeechStart: () => void, onSpeechEnd: (blob: Blob) => void) {
        this.chunks = [];
        this.isSpeechActive = false;

        this.microphoneAudio = new MicrophoneAudio({
            sampleRate: SpeechChunks.SAMPLE_RATE,
            windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
            onAudioData: this.processAudioData.bind(this)
        });

        this.onSpeechStart = onSpeechStart;
        this.onSpeechEnd = onSpeechEnd;

        this.vadDetector = new VadDetector(
            SpeechChunks.START_THRESHOLD,
            SpeechChunks.END_THRESHOLD,
            SpeechChunks.SAMPLE_RATE,
            SpeechChunks.MIN_SILENCE_DURATION_MS,
            SpeechChunks.SPEECH_PAD_MS
        );

        console.log('SpeechChunks initialized');
    }

    private async processAudioData(audioData: Float32Array): Promise<void> {
        console.log(`Processing audio data of length ${audioData.length}`);
        try {
            const result = await this.vadDetector.apply(audioData, false);
            if (result.start !== undefined) {
                this.isSpeechActive = true;
                console.log('Speech start detected');
                this.onSpeechStart();
            } else if (result.end !== undefined) {
                this.isSpeechActive = false;
                console.log('Speech end detected');
                this.onSpeechEnd(this.getBlob());
            }
            if (this.isSpeechActive) {
                console.log('Adding chunk to speech');
                this.chunks.push(Array.from(audioData));
            }
        } catch (error) {
            console.error('Error processing audio data', error);
        }
    }

    async start(): Promise<void> {
        console.log('Starting SpeechChunks');
        await this.microphoneAudio.start();
    }

    stop(): void {
        console.log('Stopping SpeechChunks');
        this.microphoneAudio.stop();
        this.vadDetector.reset();
        this.isSpeechActive = false;
    }

    getSpeechChunks(): number[][] {
        console.log(`Returning ${this.chunks.length} speech chunks`);
        const speechChunks = this.chunks;
        this.chunks = [];
        return speechChunks;
    }

    getBlob(): Blob {
        console.log('Creating audio blob from speech chunks');
        // Combine all chunks into a single Float32Array
        const combinedChunks = this.chunks;
        const combinedLength = combinedChunks.reduce((sum, chunk) => sum + chunk.length, 0);
        const combinedAudio = new Float32Array(combinedLength);
        let offset = 0;
        for (const chunk of combinedChunks) {
            combinedAudio.set(chunk, offset);
            offset += chunk.length;
        }

        // Convert Float32Array to Int16Array (common format for WAV files)
        const intData = new Int16Array(combinedAudio.length);
        for (let i = 0; i < combinedAudio.length; i++) {
            const s = Math.max(-1, Math.min(1, combinedAudio[i]));
            intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
        }

        // Create WAV file header
        const header = new ArrayBuffer(44);
        const view = new DataView(header);

        // RIFF chunk descriptor
        this.writeString(view, 0, 'RIFF');
        view.setUint32(4, 36 + intData.length * 2, true);
        this.writeString(view, 8, 'WAVE');

        // FMT sub-chunk
        this.writeString(view, 12, 'fmt ');
        view.setUint32(16, 16, true); // subchunk1 size
        view.setUint16(20, 1, true); // audio format (1 for PCM)
        view.setUint16(22, 1, true); // number of channels
        view.setUint32(24, SpeechChunks.SAMPLE_RATE, true); // sample rate
        view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true); // byte rate = sample rate * block align
        view.setUint16(32, 2, true); // block align = channels * bytes per sample
        view.setUint16(34, 16, true); // bits per sample

        // Data sub-chunk
        this.writeString(view, 36, 'data');
        view.setUint32(40, intData.length * 2, true);

        // Combine header and data
        const blob = new Blob([header, intData], { type: 'audio/wav' });
        console.log(`Created blob of size ${blob.size} bytes`);
        return blob;
    }

    // Helper function to write strings to DataView
    private writeString(view: DataView, offset: number, string: string): void {
        for (let i = 0; i < string.length; i++) {
            view.setUint8(offset + i, string.charCodeAt(i));
        }
    }

    async close(): Promise<void> {
        console.log('Closing SpeechChunks');
        this.stop();
        await this.vadDetector.close();
    }
}
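The WAV header constants follow from 16 kHz mono 16-bit PCM: block align = 1 channel × 2 bytes = 2, byte rate = 16000 × 2 = 32000, and both size fields count the `intData.length * 2` payload bytes. A hypothetical wiring sketch, not part of the commit:

import { SpeechChunks } from './SpeechChunks';

// Hypothetical glue: surface each detected utterance as a playable WAV URL.
const chunks = new SpeechChunks(
    () => console.log('speech started'),
    (wav) => console.log('utterance ready:', URL.createObjectURL(wav)),
);
chunks.start().catch((e) => console.error('microphone failed to start', e));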
VoiceActivityDetector.js
ADDED
@@ -0,0 +1,129 @@
import OnnxWrapper from './Silero'; // Assuming you have this class implemented
// Note: process.env exists here only if a bundler injects it; in a plain
// browser context the model path would have to be supplied another way.
const modelPath = process.env.VAD_MODEL_PATH;

export class VadDetector {
    private model: OnnxWrapper;
    private startThreshold: number;
    private endThreshold: number;
    private samplingRate: number;
    private minSilenceSamples: number;
    private speechPadSamples: number;
    private triggered: boolean;
    private tempEnd: number;
    private currentSample: number;

    constructor(
        startThreshold: number,
        endThreshold: number,
        samplingRate: number,
        minSilenceDurationMs: number,
        speechPadMs: number
    ) {
        if (samplingRate !== 8000 && samplingRate !== 16000) {
            throw new Error("Does not support sampling rates other than [8000, 16000]");
        }

        this.model = new OnnxWrapper(modelPath);
        this.startThreshold = startThreshold;
        this.endThreshold = endThreshold;
        this.samplingRate = samplingRate;
        this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
        this.speechPadSamples = samplingRate * speechPadMs / 1000;
        this.reset();
        console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
    }

    reset(): void {
        this.model.resetStates();
        this.triggered = false;
        this.tempEnd = 0;
        this.currentSample = 0;
        console.log('VadDetector reset');
    }

    async apply(data: Float32Array, returnSeconds: boolean): Promise<{ start?: number; end?: number }> {
        console.log(`Applying VAD to data of length ${data.length}`);
        const windowSizeSamples = data.length;
        this.currentSample += windowSizeSamples;

        // Determine the row length based on the sampling rate
        const rowLength = this.samplingRate === 16000 ? 512 : 256;

        // Calculate the number of rows
        const numRows = Math.ceil(data.length / rowLength);

        // Create the 2D array
        const x: number[][] = [];
        for (let i = 0; i < numRows; i++) {
            const start = i * rowLength;
            const end = Math.min(start + rowLength, data.length);
            x.push(Array.from(data.slice(start, end)));

            // If the last row is not full, pad it with zeros
            if (end - start < rowLength) {
                x[i] = x[i].concat(new Array(rowLength - (end - start)).fill(0));
            }
        }

        let speechProb: number;
        try {
            const modelOutput = await this.model.call(x, this.samplingRate);
            if (modelOutput && Array.isArray(modelOutput) && modelOutput[0]) {
                speechProb = modelOutput[0][0];
                console.log(`Speech probability: ${speechProb}`);
            } else {
                throw new Error("Unexpected response from model");
            }
        } catch (e) {
            console.error("Error in VadDetector.apply:", e);
            throw new Error("Error calling the model: " + e);
        }

        if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
            this.tempEnd = 0;
        }

        if (speechProb >= this.startThreshold && !this.triggered) {
            this.triggered = true;
            let speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
            console.log(`Speech start detected at sample ${speechStart}`);
            if (returnSeconds) {
                const speechStartSeconds = speechStart / this.samplingRate;
                return { start: Number(speechStartSeconds.toFixed(1)) };
            } else {
                return { start: speechStart };
            }
        }

        if (speechProb < this.endThreshold && this.triggered) {
            console.log(`Potential speech end at sample ${this.currentSample}`);
            if (this.tempEnd === 0) {
                this.tempEnd = this.currentSample;
            }

            if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
                console.log('Silence duration too short, continuing');
                return {};
            } else {
                const speechEnd = this.tempEnd + this.speechPadSamples;
                console.log(`Speech end confirmed at sample ${speechEnd}`);
                this.tempEnd = 0;
                this.triggered = false;

                if (returnSeconds) {
                    const speechEndSeconds = speechEnd / this.samplingRate;
                    return { end: Number(speechEndSeconds.toFixed(1)) };
                } else {
                    return { end: speechEnd };
                }
            }
        }

        return {};
    }

    async close(): Promise<void> {
        this.reset();
        await this.model.close();
    }
}
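The detector is a hysteresis state machine: it arms once the probability crosses startThreshold (0.6 in SpeechChunks) and only reports an end after the probability stays below endThreshold (0.45) for minSilenceDurationMs, padding both edges by speechPadMs. A driving-loop sketch, with the frame source left as an assumption:

import { VadDetector } from './VoiceActivityDetector';

// Hypothetical driver: feed consecutive 512-sample frames, print events in seconds.
const detector = new VadDetector(0.6, 0.45, 16000, 600, 500);
const frames: Float32Array[] = []; // fill with 512-sample frames from a capture pipeline
for (const frame of frames) {
    const event = await detector.apply(frame, true); // true => seconds, not samples
    if (event.start !== undefined) console.log(`speech starts at ${event.start}s`);
    if (event.end !== undefined) console.log(`speech ends at ${event.end}s`);
}
await detector.close();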
index.html
CHANGED
@@ -1,19 +1,85 @@
- <!
- <html>
- … (lines 3-19 of the old file appear blank in this view)
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Voice Activity Detection Demo</title>
+     <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
+     <style>
+         body {{
+             font-family: Arial, sans-serif;
+             max-width: 800px;
+             margin: 0 auto;
+             padding: 20px;
+         }}
+         #status {{
+             font-weight: bold;
+             margin-bottom: 10px;
+         }}
+         #audioList {{
+             margin-top: 20px;
+         }}
+     </style>
+ </head>
+ <body>
+     <h1>Voice Activity Detection Demo</h1>
+     <div id="status">Not listening</div>
+     <button id="startButton">Start Listening</button>
+     <button id="stopButton" disabled>Stop Listening</button>
+     <div id="audioList"></div>
+
+     <script type="module">
+         {speech_chunks_js}
+         {microphone_audio_js}
+         {silero_js}
+         {voice_activity_detector_js}
+
+         const status = document.getElementById('status');
+         const startButton = document.getElementById('startButton');
+         const stopButton = document.getElementById('stopButton');
+         const audioList = document.getElementById('audioList');
+
+         let speechChunks;
+
+         startButton.addEventListener('click', async () => {{
+             speechChunks = new SpeechChunks(
+                 () => {{
+                     console.log("Speech start");
+                     status.textContent = "Listening...";
+                 }},
+                 (blob) => {{
+                     console.log("Speech end");
+                     status.textContent = "Not listening";
+                     const audio = new Audio(URL.createObjectURL(blob));
+                     const listItem = document.createElement('div');
+                     listItem.appendChild(audio);
+                     const playButton = document.createElement('button');
+                     playButton.textContent = 'Play';
+                     playButton.onclick = () => audio.play();
+                     listItem.appendChild(playButton);
+                     audioList.appendChild(listItem);
+                 }}
+             );
+
+             try {{
+                 await speechChunks.start();
+                 startButton.disabled = true;
+                 stopButton.disabled = false;
+                 status.textContent = "Listening...";
+             }} catch (error) {{
+                 console.error("Failed to start VAD:", error);
+                 status.textContent = "Error starting VAD";
+             }}
+         }});
+
+         stopButton.addEventListener('click', () => {{
+             if (speechChunks) {{
+                 speechChunks.stop();
+                 startButton.disabled = false;
+                 stopButton.disabled = true;
+                 status.textContent = "Not listening";
+             }}
+         }});
+     </script>
+ </body>
+ </html>
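One note on the markup above: the doubled braces (`{{ … }}`) and the bare `{speech_chunks_js}`-style names in the script block read like Python str.format placeholders, which suggests this file is a template whose four module sources are injected at build time rather than literal hand-written HTML.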
silero_vad.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f
size 2327524
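This is a Git LFS pointer file: the ~2.3 MB ONNX model binary itself lives in LFS storage, and only its hash and size are committed to the repo.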