Upload 6 files
- MicrophoneAudio.js +129 -0
- Silero.js +165 -0
- SpeechChunks.js +144 -0
- VoiceActivityDetector.js +129 -0
- index.html +85 -19
- silero_vad.onnx +3 -0
MicrophoneAudio.js
ADDED
@@ -0,0 +1,129 @@
interface MicrophoneAudioOptions {
    sampleRate?: number;
    channels?: number;
    windowSizeSamples: number;
    onAudioData: (audioData: Float32Array) => void;
}

class MicrophoneAudio {
    private stream: MediaStream | null = null;
    private audioContext: AudioContext | null = null;
    private sourceNode: MediaStreamAudioSourceNode | null = null;
    private workletNode: AudioWorkletNode | null = null;
    private options: MicrophoneAudioOptions;
    private buffer: Float32Array = new Float32Array();

    constructor(options: MicrophoneAudioOptions) {
        console.log('Initializing MicrophoneAudio');
        this.options = {
            sampleRate: 16000,
            channels: 1,
            ...options,
        };
        console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
    }

    getDeviceId(): Promise<string> {
        console.log('Getting device ID');
        return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
            const deviceId = stream.getTracks()[0].getSettings().deviceId;
            console.log("The device Id is", deviceId);
            // Stop the probe stream so it does not keep the mic open, and fall
            // back to '' since getSettings().deviceId is optional.
            stream.getTracks().forEach((track) => track.stop());
            return deviceId ?? '';
        });
    }

    async start(): Promise<void> {
        console.log('Starting MicrophoneAudio');
        try {
            this.stream = await navigator.mediaDevices.getUserMedia({
                audio: {
                    sampleRate: this.options.sampleRate,
                    channelCount: this.options.channels,
                },
            });
            console.log('MediaStream acquired');

            this.getDeviceId().then((deviceId) => {
                console.log("The device Id is", deviceId);
            });
            this.audioContext = new AudioContext({
                sampleRate: this.options.sampleRate,
            });

            // The worklet source is inlined as a Blob URL so no separate file has
            // to be served; it buffers input and posts fixed-size windows back.
            await this.audioContext.audioWorklet.addModule(
                URL.createObjectURL(new Blob([`
                    class AudioProcessor extends AudioWorkletProcessor {
                        constructor() {
                            super();
                            this.buffer = new Float32Array();
                        }

                        process(inputs, outputs, parameters) {
                            const input = inputs[0];
                            const channelData = input[0];

                            this.buffer = Float32Array.from([...this.buffer, ...channelData]);

                            while (this.buffer.length >= ${this.options.windowSizeSamples}) {
                                const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
                                this.port.postMessage(chunk);
                                this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
                            }

                            return true;
                        }
                    }

                    registerProcessor('audio-processor', AudioProcessor);
                `], { type: 'application/javascript' }))
            );

            this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
            this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

            this.workletNode.port.onmessage = (event) => {
                this.options.onAudioData(event.data);
            };

            this.sourceNode.connect(this.workletNode);
            this.workletNode.connect(this.audioContext.destination);
            console.log('AudioWorklet added and connected');
        } catch (error) {
            console.error('Error starting microphone:', error);
            throw error;
        }
    }

    stop(): void {
        console.log('Stopping MicrophoneAudio');
        if (this.workletNode) {
            // Note: the inlined processor registers no message handler, so this
            // 'flush' is currently a no-op on the worklet side.
            this.workletNode.port.postMessage('flush');
            this.workletNode.disconnect();
            this.workletNode = null;
        }

        if (this.sourceNode) {
            this.sourceNode.disconnect();
            this.sourceNode = null;
        }

        if (this.audioContext) {
            this.audioContext.close();
            this.audioContext = null;
        }

        if (this.stream) {
            this.stream.getTracks().forEach((track) => track.stop());
            this.stream = null;
        }

        // Send any remaining data in the buffer
        if (this.buffer.length > 0) {
            this.options.onAudioData(this.buffer);
            this.buffer = new Float32Array();
        }
        console.log('MicrophoneAudio stopped');
    }
}

export default MicrophoneAudio;
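For orientation, a minimal consumer sketch (not part of the commit): it assumes a bundler resolves the module path, and the #start/#stop element ids are hypothetical. The 512-sample window matches the frame size the Silero model expects at 16 kHz, as SpeechChunks.js uses below.

import MicrophoneAudio from './MicrophoneAudio';

// Hypothetical consumer: log each fixed-size window emitted by the worklet.
const mic = new MicrophoneAudio({
    windowSizeSamples: 512, // matches the 512-sample frames SpeechChunks feeds the VAD
    onAudioData: (frame) => console.log('captured', frame.length, 'samples'),
});

// getUserMedia requires a secure context and, in practice, a user gesture.
document.querySelector('#start')?.addEventListener('click', () => mic.start());
document.querySelector('#stop')?.addEventListener('click', () => mic.stop());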
Silero.js
ADDED
@@ -0,0 +1,165 @@
import * as ort from 'onnxruntime-web';

class OnnxWrapper {
    private session: ort.InferenceSession;
    private _state: number[][];
    private _context: number[];
    private _last_sr: number;
    private _last_batch_size: number;
    private sample_rates: number[];
    private sessionReady: Promise<void>;

    constructor(path: string, force_onnx_cpu: boolean = true) {
        console.log(`Initializing OnnxWrapper with path: ${path}`);
        this.sessionReady = this.initSession(path, force_onnx_cpu);
        this.resetStates();
        this.sample_rates = [8000, 16000];
    }

    async ready(): Promise<void> {
        console.log('Waiting for OnnxWrapper session to be ready');
        await this.sessionReady;
        console.log('OnnxWrapper session is ready');
    }

    private async initSession(path: string, force_onnx_cpu: boolean) {
        console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
        const options: ort.InferenceSession.SessionOptions = {
            executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
            graphOptimizationLevel: 'all',
            executionMode: 'sequential',
            enableCpuMemArena: true,
            enableMemPattern: true,
            extra: {
                session: {
                    intra_op_num_threads: 1,
                    inter_op_num_threads: 1,
                }
            }
        };

        this.session = await ort.InferenceSession.create(path, options);
        console.log('ONNX session created successfully');
    }

    private _validate_input(x: number[][], sr: number): [number[][], number] {
        if (!Array.isArray(x[0])) {
            x = [x as unknown as number[]];
        }
        if (x.length > 2) {
            throw new Error(`Too many dimensions for input audio chunk ${x.length}`);
        }
        if (sr !== 16000 && (sr % 16000 === 0)) {
            // Naive decimation to 16 kHz: keep every (sr / 16000)-th sample.
            const step = Math.floor(sr / 16000);
            x = x.map(row => row.filter((_, i) => i % step === 0));
            sr = 16000;
        }
        if (!this.sample_rates.includes(sr)) {
            throw new Error(`Supported sampling rates: ${this.sample_rates} (or multiple of 16000)`);
        }
        if (sr / x[0].length > 31.25) {
            throw new Error("Input audio chunk is too short");
        }
        return [x, sr];
    }

    resetStates(batch_size: number = 1): void {
        console.log(`Resetting states with batch_size: ${batch_size}`);
        this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
        this._context = [];
        this._last_sr = 0;
        this._last_batch_size = 0;
    }

    async call(x: number[][], sr: number): Promise<number[][]> {
        console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
        await this.ready();
        [x, sr] = this._validate_input(x, sr);
        const num_samples = sr === 16000 ? 512 : 256;

        if (x[0].length !== num_samples) {
            throw new Error(`Provided number of samples is ${x[0].length} (Supported values: 256 for 8000 sample rate, 512 for 16000)`);
        }

        const batch_size = x.length;
        const context_size = sr === 16000 ? 64 : 32;

        if (!this._last_batch_size) {
            this.resetStates(batch_size);
        }
        if (this._last_sr && this._last_sr !== sr) {
            this.resetStates(batch_size);
        }
        if (this._last_batch_size && this._last_batch_size !== batch_size) {
            this.resetStates(batch_size);
        }
        if (this._context.length === 0) {
            this._context = Array(batch_size * context_size).fill(0);
        }

        // Prepend the trailing context from the previous window to each row.
        x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);

        if (sr === 8000 || sr === 16000) {
            const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
            const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
            // Note: some onnxruntime-web versions require BigInt64Array data for int64 tensors.
            const srTensor = new ort.Tensor('int64', [sr], []);

            const feeds: Record<string, ort.Tensor> = {
                input: inputTensor,
                state: stateTensor,
                sr: srTensor
            };

            const results = await this.session.run(feeds);
            const outputData = results.output.data as Float32Array;
            const stateData = results.stateN.data as Float32Array;

            this._state = Array(2).fill(0).map((_, i) =>
                Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
            );

            const outputShape = results.output.dims as number[];
            const out = Array(outputShape[0]).fill(0).map((_, i) =>
                Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
            );

            this._context = x.map(row => row.slice(-context_size)).flat();
            this._last_sr = sr;
            this._last_batch_size = batch_size;

            console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
            return out;
        } else {
            throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
        }
    }

    async audio_forward(x: number[][], sr: number): Promise<number[][]> {
        console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
        const outs: number[][][] = [];
        [x, sr] = this._validate_input(x, sr);
        this.resetStates();
        const num_samples = sr === 16000 ? 512 : 256;

        if (x[0].length % num_samples !== 0) {
            const pad_num = num_samples - (x[0].length % num_samples);
            x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
        }

        for (let i = 0; i < x[0].length; i += num_samples) {
            const wavs_batch = x.map(row => row.slice(i, i + num_samples));
            const out_chunk = await this.call(wavs_batch, sr);
            outs.push(out_chunk);
        }

        console.log(`audio_forward completed, output shape: [${outs.length}, ${outs[0].length}]`);
        return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
    }

    close(): void {
        console.log('Closing OnnxWrapper session');
        this.session.release();
    }
}

export default OnnxWrapper;
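A rough smoke-test sketch of the wrapper, run as an ES module (not part of the commit; the served model path is an assumption, and as the comment above notes, int64 tensor construction may need BigInt64Array on some onnxruntime-web versions). At 16 kHz the model scores one 512-sample frame per call and returns a [batch, 1] array of speech probabilities.

import OnnxWrapper from './Silero';

// Hypothetical check: score a single 512-sample frame of silence.
const vad = new OnnxWrapper('/silero_vad.onnx'); // path is an assumption
await vad.ready();
const frame = [Array(512).fill(0)];              // shape [1, 512]
const probs = await vad.call(frame, 16000);      // shape [1, 1], values in [0, 1]
console.log('speech probability:', probs[0][0]); // ~0 for silence
vad.close();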
SpeechChunks.js
ADDED
@@ -0,0 +1,144 @@
import MicrophoneAudio from './MicrophoneAudio';
import { VadDetector } from './VoiceActivityDetector';

export class SpeechChunks {
    private static readonly SAMPLE_RATE = 16000;
    private static readonly START_THRESHOLD = 0.6;
    private static readonly END_THRESHOLD = 0.45;
    private static readonly MIN_SILENCE_DURATION_MS = 600;
    private static readonly SPEECH_PAD_MS = 500;
    private static readonly WINDOW_SIZE_SAMPLES = 512;

    private chunks: number[][];
    private microphoneAudio: MicrophoneAudio;
    private vadDetector: VadDetector;
    private isSpeechActive: boolean;
    private onSpeechStart: () => void;
    private onSpeechEnd: (blob: Blob) => void;

    constructor(onSpeechStart: () => void, onSpeechEnd: (blob: Blob) => void) {
        this.chunks = [];
        this.isSpeechActive = false;

        this.microphoneAudio = new MicrophoneAudio({
            sampleRate: SpeechChunks.SAMPLE_RATE,
            windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
            onAudioData: this.processAudioData.bind(this)
        });

        this.onSpeechStart = onSpeechStart;
        this.onSpeechEnd = onSpeechEnd;

        this.vadDetector = new VadDetector(
            SpeechChunks.START_THRESHOLD,
            SpeechChunks.END_THRESHOLD,
            SpeechChunks.SAMPLE_RATE,
            SpeechChunks.MIN_SILENCE_DURATION_MS,
            SpeechChunks.SPEECH_PAD_MS
        );

        console.log('SpeechChunks initialized');
    }

    private async processAudioData(audioData: Float32Array): Promise<void> {
        console.log(`Processing audio data of length ${audioData.length}`);
        try {
            const result = await this.vadDetector.apply(audioData, false);
            if (result.start !== undefined) {
                this.isSpeechActive = true;
                console.log('Speech start detected');
                this.onSpeechStart();
            } else if (result.end !== undefined) {
                this.isSpeechActive = false;
                console.log('Speech end detected');
                this.onSpeechEnd(this.getBlob());
            }
            if (this.isSpeechActive) {
                console.log('Adding chunk to speech');
                this.chunks.push(Array.from(audioData));
            }
        } catch (error) {
            console.error('Error processing audio data', error);
        }
    }

    async start(): Promise<void> {
        console.log('Starting SpeechChunks');
        await this.microphoneAudio.start();
    }

    stop(): void {
        console.log('Stopping SpeechChunks');
        this.microphoneAudio.stop();
        this.vadDetector.reset();
        this.isSpeechActive = false;
    }

    getSpeechChunks(): number[][] {
        console.log(`Returning ${this.chunks.length} speech chunks`);
        const speechChunks = this.chunks;
        this.chunks = [];
        return speechChunks;
    }

    getBlob(): Blob {
        console.log('Creating audio blob from speech chunks');
        // Combine all chunks into a single Float32Array
        const combinedChunks = this.chunks;
        const combinedLength = combinedChunks.reduce((sum, chunk) => sum + chunk.length, 0);
        const combinedAudio = new Float32Array(combinedLength);
        let offset = 0;
        for (const chunk of combinedChunks) {
            combinedAudio.set(chunk, offset);
            offset += chunk.length;
        }

        // Convert Float32Array to Int16Array (common format for WAV files)
        const intData = new Int16Array(combinedAudio.length);
        for (let i = 0; i < combinedAudio.length; i++) {
            const s = Math.max(-1, Math.min(1, combinedAudio[i]));
            intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
        }

        // Create WAV file header
        const header = new ArrayBuffer(44);
        const view = new DataView(header);

        // RIFF chunk descriptor
        this.writeString(view, 0, 'RIFF');
        view.setUint32(4, 36 + intData.length * 2, true);
        this.writeString(view, 8, 'WAVE');

        // FMT sub-chunk
        this.writeString(view, 12, 'fmt ');
        view.setUint32(16, 16, true); // subchunk1 size
        view.setUint16(20, 1, true); // audio format (1 for PCM)
        view.setUint16(22, 1, true); // number of channels
        view.setUint32(24, SpeechChunks.SAMPLE_RATE, true); // sample rate
        view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true); // byte rate = sample rate * block align
        view.setUint16(32, 2, true); // block align = channels * bytes per sample
        view.setUint16(34, 16, true); // bits per sample

        // Data sub-chunk
        this.writeString(view, 36, 'data');
        view.setUint32(40, intData.length * 2, true);

        // Combine header and data
        const blob = new Blob([header, intData], { type: 'audio/wav' });
        console.log(`Created blob of size ${blob.size} bytes`);
        return blob;
    }

    // Helper function to write strings to DataView
    private writeString(view: DataView, offset: number, string: string): void {
        for (let i = 0; i < string.length; i++) {
            view.setUint8(offset + i, string.charCodeAt(i));
        }
    }

    async close(): Promise<void> {
        console.log('Closing SpeechChunks');
        this.stop();
        await this.vadDetector.close();
    }
}
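The WAV header constants follow from 16 kHz mono 16-bit PCM: block align = 1 channel × 2 bytes = 2, byte rate = 16000 × 2 = 32000, and both size fields count the `intData.length * 2` payload bytes. A hypothetical wiring sketch, not part of the commit:

import { SpeechChunks } from './SpeechChunks';

// Hypothetical glue: surface each detected utterance as a playable WAV URL.
const chunks = new SpeechChunks(
    () => console.log('speech started'),
    (wav) => console.log('utterance ready:', URL.createObjectURL(wav)),
);
chunks.start().catch((e) => console.error('microphone failed to start', e));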
VoiceActivityDetector.js
ADDED
@@ -0,0 +1,129 @@
import OnnxWrapper from './Silero'; // Assuming you have this class implemented
// Note: process.env exists here only if a bundler injects it; in a plain
// browser context the model path would have to be supplied another way.
const modelPath = process.env.VAD_MODEL_PATH;

export class VadDetector {
    private model: OnnxWrapper;
    private startThreshold: number;
    private endThreshold: number;
    private samplingRate: number;
    private minSilenceSamples: number;
    private speechPadSamples: number;
    private triggered: boolean;
    private tempEnd: number;
    private currentSample: number;

    constructor(
        startThreshold: number,
        endThreshold: number,
        samplingRate: number,
        minSilenceDurationMs: number,
        speechPadMs: number
    ) {
        if (samplingRate !== 8000 && samplingRate !== 16000) {
            throw new Error("Does not support sampling rates other than [8000, 16000]");
        }

        this.model = new OnnxWrapper(modelPath);
        this.startThreshold = startThreshold;
        this.endThreshold = endThreshold;
        this.samplingRate = samplingRate;
        this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
        this.speechPadSamples = samplingRate * speechPadMs / 1000;
        this.reset();
        console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
    }

    reset(): void {
        this.model.resetStates();
        this.triggered = false;
        this.tempEnd = 0;
        this.currentSample = 0;
        console.log('VadDetector reset');
    }

    async apply(data: Float32Array, returnSeconds: boolean): Promise<{ start?: number; end?: number }> {
        console.log(`Applying VAD to data of length ${data.length}`);
        const windowSizeSamples = data.length;
        this.currentSample += windowSizeSamples;

        // Determine the row length based on the sampling rate
        const rowLength = this.samplingRate === 16000 ? 512 : 256;

        // Calculate the number of rows
        const numRows = Math.ceil(data.length / rowLength);

        // Create the 2D array
        const x: number[][] = [];
        for (let i = 0; i < numRows; i++) {
            const start = i * rowLength;
            const end = Math.min(start + rowLength, data.length);
            x.push(Array.from(data.slice(start, end)));

            // If the last row is not full, pad it with zeros
            if (end - start < rowLength) {
                x[i] = x[i].concat(new Array(rowLength - (end - start)).fill(0));
            }
        }

        let speechProb: number;
        try {
            const modelOutput = await this.model.call(x, this.samplingRate);
            if (modelOutput && Array.isArray(modelOutput) && modelOutput[0]) {
                speechProb = modelOutput[0][0];
                console.log(`Speech probability: ${speechProb}`);
            } else {
                throw new Error("Unexpected response from model");
            }
        } catch (e) {
            console.error("Error in VadDetector.apply:", e);
            throw new Error("Error calling the model: " + e);
        }

        if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
            this.tempEnd = 0;
        }

        if (speechProb >= this.startThreshold && !this.triggered) {
            this.triggered = true;
            let speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
            console.log(`Speech start detected at sample ${speechStart}`);
            if (returnSeconds) {
                const speechStartSeconds = speechStart / this.samplingRate;
                return { start: Number(speechStartSeconds.toFixed(1)) };
            } else {
                return { start: speechStart };
            }
        }

        if (speechProb < this.endThreshold && this.triggered) {
            console.log(`Potential speech end at sample ${this.currentSample}`);
            if (this.tempEnd === 0) {
                this.tempEnd = this.currentSample;
            }

            if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
                console.log('Silence duration too short, continuing');
                return {};
            } else {
                const speechEnd = this.tempEnd + this.speechPadSamples;
                console.log(`Speech end confirmed at sample ${speechEnd}`);
                this.tempEnd = 0;
                this.triggered = false;

                if (returnSeconds) {
                    const speechEndSeconds = speechEnd / this.samplingRate;
                    return { end: Number(speechEndSeconds.toFixed(1)) };
                } else {
                    return { end: speechEnd };
                }
            }
        }

        return {};
    }

    async close(): Promise<void> {
        this.reset();
        await this.model.close();
    }
}
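The detector is a hysteresis state machine: it arms once the probability crosses startThreshold (0.6 in SpeechChunks) and only reports an end after the probability stays below endThreshold (0.45) for minSilenceDurationMs, padding both edges by speechPadMs. A driving-loop sketch, with the frame source left as an assumption:

import { VadDetector } from './VoiceActivityDetector';

// Hypothetical driver: feed consecutive 512-sample frames, print events in seconds.
const detector = new VadDetector(0.6, 0.45, 16000, 600, 500);
const frames: Float32Array[] = []; // fill with 512-sample frames from a capture pipeline
for (const frame of frames) {
    const event = await detector.apply(frame, true); // true => seconds, not samples
    if (event.start !== undefined) console.log(`speech starts at ${event.start}s`);
    if (event.end !== undefined) console.log(`speech ends at ${event.end}s`);
}
await detector.close();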
index.html
CHANGED
@@ -1,19 +1,85 @@
- <!
- <html>
- … (lines 3-19 of the old file appear blank in this view)
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Voice Activity Detection Demo</title>
+     <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
+     <style>
+         body {{
+             font-family: Arial, sans-serif;
+             max-width: 800px;
+             margin: 0 auto;
+             padding: 20px;
+         }}
+         #status {{
+             font-weight: bold;
+             margin-bottom: 10px;
+         }}
+         #audioList {{
+             margin-top: 20px;
+         }}
+     </style>
+ </head>
+ <body>
+     <h1>Voice Activity Detection Demo</h1>
+     <div id="status">Not listening</div>
+     <button id="startButton">Start Listening</button>
+     <button id="stopButton" disabled>Stop Listening</button>
+     <div id="audioList"></div>
+
+     <script type="module">
+         {speech_chunks_js}
+         {microphone_audio_js}
+         {silero_js}
+         {voice_activity_detector_js}
+
+         const status = document.getElementById('status');
+         const startButton = document.getElementById('startButton');
+         const stopButton = document.getElementById('stopButton');
+         const audioList = document.getElementById('audioList');
+
+         let speechChunks;
+
+         startButton.addEventListener('click', async () => {{
+             speechChunks = new SpeechChunks(
+                 () => {{
+                     console.log("Speech start");
+                     status.textContent = "Listening...";
+                 }},
+                 (blob) => {{
+                     console.log("Speech end");
+                     status.textContent = "Not listening";
+                     const audio = new Audio(URL.createObjectURL(blob));
+                     const listItem = document.createElement('div');
+                     listItem.appendChild(audio);
+                     const playButton = document.createElement('button');
+                     playButton.textContent = 'Play';
+                     playButton.onclick = () => audio.play();
+                     listItem.appendChild(playButton);
+                     audioList.appendChild(listItem);
+                 }}
+             );
+
+             try {{
+                 await speechChunks.start();
+                 startButton.disabled = true;
+                 stopButton.disabled = false;
+                 status.textContent = "Listening...";
+             }} catch (error) {{
+                 console.error("Failed to start VAD:", error);
+                 status.textContent = "Error starting VAD";
+             }}
+         }});
+
+         stopButton.addEventListener('click', () => {{
+             if (speechChunks) {{
+                 speechChunks.stop();
+                 startButton.disabled = false;
+                 stopButton.disabled = true;
+                 status.textContent = "Not listening";
+             }}
+         }});
+     </script>
+ </body>
+ </html>
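One note on the markup above: the doubled braces (`{{ … }}`) and the bare `{speech_chunks_js}`-style names in the script block read like Python str.format placeholders, which suggests this file is a template whose four module sources are injected at build time rather than literal hand-written HTML.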
silero_vad.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f
size 2327524
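This is a Git LFS pointer file: the ~2.3 MB ONNX model binary itself lives in LFS storage, and only its hash and size are committed to the repo.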