Delete main2.py
main2.py
DELETED
@@ -1,289 +0,0 @@
#!/usr/bin/env python3
"""
Optimal VAD Implementation using RNN Decoder + Fixed Classifier

This uses the best combination discovered:
- silero_rnn_decoder.mlmodel (proper output magnitudes)
- correct_classifier_conv1d.mlpackage (fixed Conv1d)
"""

import os

import coremltools as ct
import librosa
import numpy as np


class OptimalCoreMLVAD:
    """Optimal VAD using the RNN Decoder + Fixed Classifier combination."""

    def __init__(self):
        """Initialize the VAD pipeline with the optimal models."""
        print("Loading Optimal CoreML models...")

        # Load the preprocessing models with explicit ANE preference
        self.stft_model = ct.models.MLModel("silero_stft.mlmodel", compute_units=ct.ComputeUnit.ALL)
        self.encoder_model = ct.models.MLModel("silero_encoder.mlmodel", compute_units=ct.ComputeUnit.ALL)

        # Load the OPTIMAL combination with ANE preference
        self.rnn_model = ct.models.MLModel("silero_rnn_decoder.mlmodel", compute_units=ct.ComputeUnit.ALL)
        self.classifier_model = ct.models.MLModel("correct_classifier_conv1d.mlpackage", compute_units=ct.ComputeUnit.ALL)

        print("✅ Optimal models loaded:")
        print("   - STFT: silero_stft.mlmodel")
        print("   - Encoder: silero_encoder.mlmodel")
        print("   - RNN: silero_rnn_decoder.mlmodel (🥇 BEST)")
        print("   - Classifier: correct_classifier_conv1d.mlpackage (🔧 FIXED)")
        print("🧠 All models configured for Neural Engine (ANE) acceleration")

        # LSTM state for the RNN decoder: (num_layers, batch, hidden) = (1, 1, 128)
        self.h_state = np.zeros((1, 1, 128), dtype=np.float32)
        self.c_state = np.zeros((1, 1, 128), dtype=np.float32)

        # Feature buffer that carries temporal context across chunks
        self.feature_buffer = []

        print("✅ Optimal VAD loaded successfully!")

    def reset_state(self):
        """Reset the RNN state and feature buffer (call this between files/streams)."""
        self.h_state = np.zeros((1, 1, 128), dtype=np.float32)
        self.c_state = np.zeros((1, 1, 128), dtype=np.float32)
        self.feature_buffer = []

    def process_chunk(self, audio_chunk):
        """Process one audio chunk and return a speech probability in [0, 1]."""
        # Ensure batched shape (1, num_samples)
        if audio_chunk.ndim == 1:
            audio_chunk = audio_chunk.reshape(1, -1)

        # STFT processing
        stft_result = self.stft_model.predict({"audio_input": audio_chunk})
        stft_output_key = list(stft_result.keys())[0]
        stft_features = stft_result[stft_output_key]

        # Temporal context: keep a rolling window of the last 4 frames,
        # zero-padded at the front until the buffer fills up
        self.feature_buffer.append(stft_features)
        if len(self.feature_buffer) > 4:
            self.feature_buffer = self.feature_buffer[-4:]
        while len(self.feature_buffer) < 4:
            self.feature_buffer.insert(0, np.zeros_like(stft_features))

        # Concatenate along the time dimension
        stft_features = np.concatenate(self.feature_buffer, axis=-1)

        # Encoder processing
        encoder_result = self.encoder_model.predict({"stft_features": stft_features})
        encoder_output_key = list(encoder_result.keys())[0]
        encoder_features = encoder_result[encoder_output_key]

        # Reshape encoder features to (1, T, C) for the RNN
        encoder_features = np.transpose(encoder_features, (0, 2, 1))

        # Keep exactly the last 4 timesteps, zero-padding at the front if needed
        if encoder_features.shape[1] > 4:
            encoder_features = encoder_features[:, -4:, :]
        elif encoder_features.shape[1] < 4:
            padding = 4 - encoder_features.shape[1]
            pad_shape = (encoder_features.shape[0], padding, encoder_features.shape[2])
            encoder_features = np.concatenate(
                [np.zeros(pad_shape, dtype=np.float32), encoder_features], axis=1)

        # Ensure the feature dimension is 128 for the RNN (truncate or zero-pad;
        # pads stay float32 so the CoreML inputs keep a consistent dtype)
        if encoder_features.shape[2] != 128:
            if encoder_features.shape[2] > 128:
                encoder_features = encoder_features[:, :, :128]
            else:
                padding = 128 - encoder_features.shape[2]
                pad_shape = (encoder_features.shape[0], encoder_features.shape[1], padding)
                encoder_features = np.concatenate(
                    [encoder_features, np.zeros(pad_shape, dtype=np.float32)], axis=2)

        # RNN Decoder processing with explicit state management
        rnn_result = self.rnn_model.predict({
            "encoder_features": encoder_features,
            "h_in": self.h_state,
            "c_in": self.c_state
        })

        # Identify the RNN Decoder outputs by shape: the sequence output has
        # T > 1, while the two (1, 1, 128) tensors are the h/c states. Telling
        # h from c relies on dict iteration order; matching by output name
        # would be more robust if the names are known (see the note below).
        rnn_features = None
        new_h_state = None
        new_c_state = None
        for value in rnn_result.values():
            if value.ndim == 3 and value.shape[1] > 1:  # sequence output
                rnn_features = value
            elif value.shape == (1, 1, 128):  # state output
                if new_h_state is None:
                    new_h_state = value
                else:
                    new_c_state = value

        # Carry the states over to the next chunk
        if new_h_state is not None:
            self.h_state = new_h_state
        if new_c_state is not None:
            self.c_state = new_c_state

        if rnn_features is None:
            raise RuntimeError("Could not find RNN sequence output")

        # Coerce to the classifier's expected shape (1, 4, 128)
        if rnn_features.shape != (1, 4, 128):
            if rnn_features.shape[1] != 4:
                if rnn_features.shape[1] > 4:
                    rnn_features = rnn_features[:, -4:, :]
                else:
                    # Repeat the last timestep to fill the 4-step window
                    last_timestep = rnn_features[:, -1:, :]
                    padding_needed = 4 - rnn_features.shape[1]
                    padding = np.repeat(last_timestep, padding_needed, axis=1)
                    rnn_features = np.concatenate([rnn_features, padding], axis=1)

            if rnn_features.shape[2] != 128:
                if rnn_features.shape[2] > 128:
                    rnn_features = rnn_features[:, :, :128]
                else:
                    padding = 128 - rnn_features.shape[2]
                    pad_shape = (rnn_features.shape[0], rnn_features.shape[1], padding)
                    rnn_features = np.concatenate(
                        [rnn_features, np.zeros(pad_shape, dtype=np.float32)], axis=2)

        # Classifier processing with the fixed Conv1d model (clean output!)
        classifier_result = self.classifier_model.predict({"rnn_features": rnn_features})
        classifier_output_key = list(classifier_result.keys())[0]
        vad_prob = float(classifier_result[classifier_output_key].squeeze())

        return vad_prob


def process_file(filename, vad, sample_rate=16000, chunk_size=512, threshold=0.5):
    """Process an audio file with the VAD and print the result."""
    print(f"\n🎧 Processing: {filename}")

    # Reset state for the new file
    vad.reset_state()

    # Load audio (librosa loads mono by default; to_mono is a safety net)
    y, _ = librosa.load(filename, sr=sample_rate)
    if y.ndim > 1:
        y = librosa.to_mono(y)

    # 512 samples at 16 kHz = 32 ms per chunk
    num_chunks = len(y) // chunk_size
    vad_scores = []

    for i in range(num_chunks):
        start = i * chunk_size
        chunk = y[start:start + chunk_size]
        if len(chunk) < chunk_size:
            break  # skip a trailing short chunk
        prob = vad.process_chunk(chunk.astype(np.float32))
        vad_scores.append(prob)

    # Average VAD probability across all chunks
    avg_vad = np.mean(vad_scores) if vad_scores else 0.0
    status = "🟢 Speech" if avg_vad >= threshold else "⚫️ Silence"

    print(f"{os.path.basename(filename):<18} | Avg VAD: {avg_vad:.4f} | {status}")


def test_optimal_vad():
    """Test the optimal VAD implementation on the MP3 files in ./test."""
    print("🚀 Testing OPTIMAL VAD Implementation")
    print("=" * 60)
    print("🥇 Using BEST model combination:")
    print("   - RNN: silero_rnn_decoder.mlmodel")
    print("   - Classifier: correct_classifier_conv1d.mlpackage")
    print()

    vad = OptimalCoreMLVAD()

    test_folder = "test"
    if not os.path.exists(test_folder):
        print(f"❌ Test folder '{test_folder}' not found!")
        return

    test_files = sorted(f for f in os.listdir(test_folder) if f.endswith(".mp3"))
    if not test_files:
        print(f"❌ No MP3 files found in '{test_folder}' folder!")
        return

    print(f"{'File':<18} | {'VAD Score':<9} | {'Result'}")
    print("-" * 50)

    human_scores = []
    ambient_scores = []

    for file in test_files:
        full_path = os.path.join(test_folder, file)

        # Score the first 10 chunks of each file
        vad.reset_state()
        y, _ = librosa.load(full_path, sr=16000)
        if y.ndim > 1:
            y = librosa.to_mono(y)

        chunk_size = 512
        num_chunks = min(10, len(y) // chunk_size)
        vad_scores = []

        for i in range(num_chunks):
            start = i * chunk_size
            chunk = y[start:start + chunk_size]
            if len(chunk) < chunk_size:
                break
            prob = vad.process_chunk(chunk.astype(np.float32))
            vad_scores.append(prob)

        avg_vad = np.mean(vad_scores) if vad_scores else 0.0

        # Categorize for the analysis below (filenames contain "human" or "ambient")
        if "human" in file:
            human_scores.append(avg_vad)
        elif "ambient" in file:
            ambient_scores.append(avg_vad)

        # Display result
        status = "🟢 Speech" if avg_vad >= 0.5 else "⚫️ Silence"
        print(f"{file:<18} | {avg_vad:<9.4f} | {status}")

    # Analysis: how well do the scores separate speech from ambient noise?
    if human_scores and ambient_scores:
        human_avg = np.mean(human_scores)
        ambient_avg = np.mean(ambient_scores)
        separation = human_avg - ambient_avg

        print("\n📊 PERFORMANCE ANALYSIS:")
        print(f"   👤 Human average:   {human_avg:.4f}")
        print(f"   🌿 Ambient average: {ambient_avg:.4f}")
        print(f"   📈 Separation:      {separation:.4f}")

        if separation > 0.05:
            print("   ✅ EXCELLENT: Strong separation")
        elif separation > 0.01:
            print("   ✅ GOOD: Clear separation")
        elif separation > 0:
            print("   ⚠️ WEAK: Small separation")
        else:
            print("   ❌ POOR: No separation or inverted")

    print("\n✅ Optimal VAD testing completed!")


if __name__ == "__main__":
    test_optimal_vad()
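
A note on the output handling in process_chunk: the code tells the RNN decoder's sequence output apart from its h/c state outputs by shape, and relies on dict iteration order to distinguish h from c. A more robust approach is to read the declared output names from the model spec and index the prediction dict by name. A minimal sketch with coremltools (the model file name matches the one loaded above; the printed names are whatever the conversion produced):

import coremltools as ct

# Print the declared inputs and outputs of the RNN decoder so that
# process_chunk can address rnn_result by name instead of guessing by shape.
spec = ct.models.MLModel("silero_rnn_decoder.mlmodel").get_spec()
for inp in spec.description.input:
    print("input: ", inp.name)
for out in spec.description.output:
    print("output:", out.name)

With the names known, the h/c updates reduce to direct lookups such as rnn_result["h_out"] (name hypothetical).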
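
For reference, a minimal streaming driver for the class, assuming the four model files sit next to the script. The 512-sample chunks match the 32 ms hop used in the tests above (16000 Hz × 0.032 s = 512 samples); the 8-chunk smoothing window is an illustrative addition, not part of the original script:

import numpy as np

# Feed 32 ms chunks (512 samples at 16 kHz) and smooth the per-chunk
# probabilities over the last 8 chunks (~256 ms) before thresholding.
vad = OptimalCoreMLVAD()
vad.reset_state()

rng = np.random.default_rng(0)
stream = rng.standard_normal(16000 * 2).astype(np.float32)  # 2 s of noise as a stand-in signal

history = []
for start in range(0, len(stream) - 511, 512):
    prob = vad.process_chunk(stream[start:start + 512])
    history = (history + [prob])[-8:]
    print(f"{start / 16000:5.2f}s  p={prob:.3f}  smoothed={np.mean(history):.3f}")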