abdu-l7hman committed
Commit 1154abd · 0 Parent(s)

Initial commit with model and app

.dockerignore ADDED
@@ -0,0 +1,10 @@
+ # .dockerignore
+
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .DS_Store
+ .env
+ venv/
+ .git/
.gitattributes ADDED
@@ -0,0 +1 @@
+ models/*.pth filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ # Dockerfile
+
+ # 1. Base image
+ FROM python:3.9-slim
+
+ # 2. Install system dependencies (ffmpeg, libsndfile)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libsndfile1 \
+     ffmpeg \
+     && rm -rf /var/lib/apt/lists/*
+
+ # 3. Create a non-root user (required for Hugging Face Spaces)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ # 4. Set the working directory
+ WORKDIR /home/user/app
+
+ # 5. Copy requirements and install dependencies
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir --timeout=600 -r requirements.txt
+
+ # 6. Copy the application code
+ COPY --chown=user . .
+
+ # 7. Expose the port Hugging Face Spaces expects
+ EXPOSE 7860
+
+ # 8. Run with Gunicorn on port 7860
+ # Note: timeout increased to 120 s because audio processing on CPU can be slow
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--timeout", "120", "app:app"]
app.py ADDED
@@ -0,0 +1,126 @@
+ from flask import Flask, render_template, request, jsonify
+ from flask_cors import CORS  # CORS support so browser clients can call the API
+ import os
+ from werkzeug.utils import secure_filename
+ import time
+ import sys
+ import subprocess
+ import shutil
+
+ # Ensure we can find the models folder
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'models'))
+
+ # Lazy-import model dependencies
+ try:
+     from models.stutter_detector_local import ImprovedStutterDetector, calculate_stutter_severity
+     MODEL_DEPS_AVAILABLE = True
+ except Exception as e:
+     print(f"Model dependencies unavailable: {e}")
+     ImprovedStutterDetector = None
+     def calculate_stutter_severity(_):
+         return None
+     MODEL_DEPS_AVAILABLE = False
+
+ app = Flask(__name__)
+ CORS(app)  # enable CORS for all routes
+
+ # Config
+ UPLOAD_FOLDER = '/tmp/uploads'  # Use /tmp because other folders might be read-only on HF
+ ALLOWED_EXTENSIONS = {'wav', 'mp3', 'ogg', 'webm', 'm4a'}
+ MODEL_PATH = 'models/stutter_detector_all_types.pth'
+
+ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MB upload limit
+
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # Check for ffmpeg on PATH
+ FFMPEG_AVAILABLE = shutil.which('ffmpeg') is not None
+
+ # Load model
+ print("Loading stutter detection model...")
+ detector = None
+ if MODEL_DEPS_AVAILABLE and ImprovedStutterDetector is not None:
+     try:
+         detector = ImprovedStutterDetector(MODEL_PATH, device='cpu')  # Force CPU
+         print("Model loaded successfully!")
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         detector = None
+ else:
+     print("Skipping model load due to missing dependencies.")
+
+ def allowed_file(filename):
+     return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+ def convert_to_wav(input_path, output_path):
+     """Convert any supported input to 16 kHz mono PCM WAV via ffmpeg."""
+     try:
+         subprocess.run([
+             'ffmpeg', '-i', input_path,
+             '-acodec', 'pcm_s16le',
+             '-ar', '16000',
+             '-ac', '1',
+             '-y',
+             output_path
+         ], check=True, capture_output=True)
+         return True
+     except subprocess.CalledProcessError as e:
+         print(f"FFmpeg conversion error: {e.stderr.decode()}")
+         return False
+     except FileNotFoundError:
+         return False
+
+ # The audio analysis itself is detector.analyze_audio_file, defined in
+ # models/stutter_detector_local.py.
+
+ # Routes: only the endpoints the API strictly needs
+ @app.route('/health', methods=['GET'])
+ def health_check():
+     return jsonify({'status': 'healthy', 'model_loaded': detector is not None}), 200
+
+ @app.route('/upload', methods=['POST'])
+ def upload_file():
+     if detector is None:
+         return jsonify({'error': 'Model not loaded.'}), 500
+
+     if 'audio' not in request.files:
+         return jsonify({'error': 'No audio file provided'}), 400
+
+     file = request.files['audio']
+
+     if file and allowed_file(file.filename):
+         filename = secure_filename(file.filename)
+         filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+         file.save(filepath)
+
+         # Get params from the request, falling back to defaults
+         segment_duration = float(request.form.get('segment_duration', 3.0))
+         stutter_threshold = float(request.form.get('stutter_threshold', 0.5))
+
+         try:
+             # Run analysis
+             results = detector.analyze_audio_file(
+                 filepath,
+                 segment_duration=segment_duration,
+                 stutter_threshold=stutter_threshold
+             )
+
+             # Calculate severity
+             results['severity'] = calculate_stutter_severity(results)
+
+             # Clean up the uploaded file
+             if os.path.exists(filepath): os.remove(filepath)
+
+             return jsonify(results), 200
+
+         except Exception as e:
+             if os.path.exists(filepath): os.remove(filepath)
+             return jsonify({'error': str(e)}), 500
+
+     return jsonify({'error': 'Invalid file'}), 400
+
+ if __name__ == '__main__':
+     # Local testing only; the Docker image runs Gunicorn instead
+     app.run(host='0.0.0.0', port=7860)
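For reference, /upload expects a multipart form with an audio file plus optional segment_duration and stutter_threshold fields. A client sketch, assuming the third-party requests library and a local sample.wav (both are assumptions; neither ships with this commit):

# upload_example.py (hypothetical client for the /upload endpoint).
import requests  # assumed installed; not in requirements.txt

with open("sample.wav", "rb") as f:  # sample.wav is a placeholder clip
    resp = requests.post(
        "http://localhost:7860/upload",
        files={"audio": ("sample.wav", f, "audio/wav")},
        data={"segment_duration": "3.0", "stutter_threshold": "0.5"},
        timeout=300,  # CPU inference on long clips can be slow
    )

resp.raise_for_status()
report = resp.json()
print(f"Stuttering: {report['stutter_percentage']:.1f}% "
      f"({report['stutter_count']}/{report['total_segments']} segments)")
print("Severity:", report["severity"]["severity_level"])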
models/__pycache__/stutter_detector_local.cpython-312.pyc ADDED
Binary file (14.8 kB).
 
models/__pycache__/stutter_detector_local.cpython-314.pyc ADDED
Binary file (16.2 kB).
 
models/stutter_detector_all_types.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:819effa4e2727f82295d9f7c7cd2647159ef55001c8f7eda7cb009130d45ea56
+ size 378509393
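This is a Git LFS pointer rather than the weights themselves; the ~379 MB checkpoint is fetched from LFS storage on clone or checkout. Per the LFS spec, the oid is the SHA-256 of the file content, so a download can be verified locally. A sketch (the script is not part of the commit):

# verify_weights.py (hypothetical) - check the fetched checkpoint against the pointer.
import hashlib

EXPECTED_OID = "819effa4e2727f82295d9f7c7cd2647159ef55001c8f7eda7cb009130d45ea56"

h = hashlib.sha256()
with open("models/stutter_detector_all_types.pth", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

assert h.hexdigest() == EXPECTED_OID, "checkpoint does not match the LFS pointer"
print("checkpoint OK")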
models/stutter_detector_local.py ADDED
@@ -0,0 +1,300 @@
+ # ============================================
+ # LOCAL PC STUTTER DETECTION SETUP
+ # Run this on your local machine
+ # ============================================
+
+ import os
+ import numpy as np
+ import librosa
+ import torch
+ import torch.nn as nn
+ import transformers
+ from typing import List, Tuple
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ # ============================================
+ # MODEL ARCHITECTURE (MUST MATCH TRAINING)
+ # ============================================
+
+ class ImprovedWav2VecClassifier(nn.Module):
+     """Improved classifier matching the training architecture."""
+
+     def __init__(self, hidden_dim=768, intermediate_dim=256, output_dim=2, dropout=0.3):
+         super().__init__()
+
+         # Load the pre-trained Wav2Vec2 encoder
+         self.wav2vec = transformers.Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base')
+
+         # Freeze Wav2Vec2 parameters; only the head is trained
+         for param in self.wav2vec.parameters():
+             param.requires_grad = False
+
+         # Classification head
+         self.classifier = nn.Sequential(
+             nn.Linear(hidden_dim, intermediate_dim),
+             nn.BatchNorm1d(intermediate_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+             nn.Linear(intermediate_dim, intermediate_dim // 2),
+             nn.BatchNorm1d(intermediate_dim // 2),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+             nn.Linear(intermediate_dim // 2, output_dim)
+         )
+
+     def forward(self, x):
+         with torch.no_grad():
+             encoder_output = self.wav2vec(x).last_hidden_state
+         pooled_features = encoder_output.mean(dim=1)  # mean-pool over time
+         return self.classifier(pooled_features)
+
+
+ # ============================================
+ # FEATURE EXTRACTOR
+ # ============================================
+
+ class Wav2VecFeatureExtractor:
+     """Extract Wav2Vec2 input features from raw audio."""
+
+     def __init__(self, model_name='facebook/wav2vec2-base', duration=3):
+         self.processor = transformers.Wav2Vec2FeatureExtractor.from_pretrained(model_name)
+         self.duration = duration
+         self.sample_rate = 16000
+
+     def extract_features(self, audio_data, sr):
+         try:
+             if sr != self.sample_rate:
+                 audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=self.sample_rate)
+
+             features = self.processor(audio_data, sampling_rate=self.sample_rate, return_tensors='pt').input_values
+             return features.squeeze(0)
+         except Exception as e:
+             print(f"Error extracting features: {e}")
+             return None
+
+
+ # ============================================
+ # AUDIO PROCESSING FUNCTIONS
+ # ============================================
+
+ def load_audio_file(file_path: str) -> Tuple[np.ndarray, int]:
+     """Load an audio file at its native sample rate."""
+     try:
+         audio_data, sr = librosa.load(file_path, sr=None)
+         return audio_data, sr
+     except Exception as e:
+         raise Exception(f"Error loading audio file: {e}")
+
+
+ def segment_audio(audio_data: np.ndarray, sr: int, segment_duration: float = 3.0) -> List[np.ndarray]:
+     """Split audio into fixed-duration segments."""
+     segment_samples = int(segment_duration * sr)
+     segments = []
+
+     for i in range(0, len(audio_data), segment_samples):
+         segment = audio_data[i:i + segment_samples]
+
+         if len(segment) >= sr:  # Keep segments of at least 1 second
+             if len(segment) < segment_samples:
+                 padding = segment_samples - len(segment)
+                 segment = np.pad(segment, (0, padding), mode='constant')
+             segments.append(segment)
+
+     return segments
+
+
+ def pad_or_truncate_features(features: torch.Tensor, max_length: int = 32007) -> torch.Tensor:
+     """Pad or truncate features to match the expected input length."""
+     if features.size(0) < max_length:
+         padding = max_length - features.size(0)
+         features = torch.cat([features, torch.zeros(padding)], dim=0)
+     elif features.size(0) > max_length:
+         features = features[:max_length]
+     return features
+
+
+ # ============================================
+ # STUTTER DETECTOR CLASS
+ # ============================================
+
+ class ImprovedStutterDetector:
+     """Stutter detector for all types: prolongations, blocks, repetitions, interjections."""
+
+     def __init__(self, model_path: str, device: str = None):
+         # Set device
+         if device is None:
+             self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         else:
+             self.device = torch.device(device)
+
+         print(f"Using device: {self.device}")
+
+         # Load model weights
+         print("Loading model...")
+         self.model = ImprovedWav2VecClassifier()
+         self.model.load_state_dict(torch.load(model_path, map_location=self.device))
+         self.model.to(self.device)
+         self.model.eval()
+         print("✓ Model loaded successfully!")
+
+         # Initialize feature extractor
+         self.feature_extractor = Wav2VecFeatureExtractor(duration=3)
+
+         # Class names
+         self.class_names = ['No Stutter', 'Stutter (All Types)']
+
+         print("\nThis model detects ALL stutter types:")
+         print("  • Prolongations (ssssso)")
+         print("  • Blocks (getting stuck)")
+         print("  • Sound Repetitions (b-b-ball)")
+         print("  • Word Repetitions (I-I-I want)")
+         print("  • Interjections (um, uh)")
+
+     def analyze_audio_file(self, file_path: str, segment_duration: float = 3.0,
+                            stutter_threshold: float = 0.5, show_probabilities: bool = True) -> dict:
+         """Analyze an entire audio file for stuttering."""
+
+         print(f"\n{'='*70}")
+         print(f"ANALYZING: {os.path.basename(file_path)}")
+         print(f"{'='*70}")
+
+         # Load audio
+         audio_data, sr = load_audio_file(file_path)
+         duration = len(audio_data) / sr
+         print(f"📊 Audio duration: {duration:.2f} seconds")
+
+         # Segment audio
+         segments = segment_audio(audio_data, sr, segment_duration)
+         print(f"📊 Number of segments: {len(segments)}")
+
+         if len(segments) == 0:
+             return {'error': 'Audio too short for analysis (minimum 1 second required)'}
+
+         # Analyze each segment
+         results = []
+         stutter_count = 0
+
+         print(f"\n{'='*70}")
+         print("SEGMENT ANALYSIS")
+         print(f"{'='*70}")
+
+         for i, segment in enumerate(segments):
+             features = self.feature_extractor.extract_features(segment, sr)
+             if features is None:
+                 continue
+
+             features = pad_or_truncate_features(features)
+             features = features.unsqueeze(0).to(self.device)
+
+             with torch.no_grad():
+                 outputs = self.model(features)
+                 probabilities = torch.softmax(outputs, dim=1)
+                 predicted_class = torch.argmax(probabilities, dim=1).item()
+                 confidence = probabilities[0][predicted_class].item()
+
+             no_stutter_prob = probabilities[0][0].item()
+             stutter_prob = probabilities[0][1].item()
+
+             is_stutter = stutter_prob >= stutter_threshold
+
+             results.append({
+                 'segment': i + 1,
+                 'prediction': self.class_names[predicted_class],
+                 'confidence': confidence,
+                 'is_stutter': is_stutter,
+                 'no_stutter_probability': no_stutter_prob,
+                 'stutter_probability': stutter_prob
+             })
+
+             if is_stutter:
+                 stutter_count += 1
+
+             if show_probabilities:
+                 status_emoji = "🔴" if is_stutter else "🟢"
+                 status_text = "STUTTER DETECTED" if is_stutter else "Clear"
+                 print(f"{status_emoji} Segment {i+1}: {status_text}")
+                 print(f"   No Stutter: {no_stutter_prob:.2%} | Stutter: {stutter_prob:.2%}")
+
+         # Calculate statistics
+         total_segments = len(results)
+         stutter_percentage = (stutter_count / total_segments * 100) if total_segments > 0 else 0
+
+         print(f"\n{'='*70}")
+         print("FINAL RESULTS")
+         print(f"{'='*70}")
+         print(f"✓ Total segments analyzed: {total_segments}")
+         print(f"🔴 Segments with stutter: {stutter_count}")
+         print(f"🟢 Segments without stutter: {total_segments - stutter_count}")
+         print(f"📊 Stuttering percentage: {stutter_percentage:.1f}%")
+
+         return {
+             'file_path': file_path,
+             'duration': duration,
+             'total_segments': total_segments,
+             'stutter_count': stutter_count,
+             'no_stutter_count': total_segments - stutter_count,
+             'stutter_percentage': stutter_percentage,
+             'segment_results': results
+         }
+
+
+ # ============================================
+ # SEVERITY ANALYSIS
+ # ============================================
+
+ def calculate_stutter_severity(results):
+     """Calculate detailed stutter severity metrics."""
+     segment_results = results.get('segment_results', [])
+     if not segment_results:  # e.g. the analysis returned an error
+         return None
+
+     stutter_probs = [seg['stutter_probability'] for seg in segment_results]
+
+     avg_prob = sum(stutter_probs) / len(stutter_probs)
+     max_prob = max(stutter_probs)
+     min_prob = min(stutter_probs)
+
+     # Count segments by severity band
+     severe = sum(1 for p in stutter_probs if p > 0.6)
+     moderate = sum(1 for p in stutter_probs if 0.4 < p <= 0.6)
+     mild = sum(1 for p in stutter_probs if 0.2 < p <= 0.4)
+     minimal = sum(1 for p in stutter_probs if p <= 0.2)
+
+     # Severity score = stuttered segments / total segments
+     total_segments = results.get('total_segments', 0)
+     stutter_count = results.get('stutter_count', 0)
+     severity_score = stutter_count / total_segments if total_segments > 0 else 0.0
+
+     print(f"\n{'='*70}")
+     print("DETAILED SEVERITY ANALYSIS")
+     print(f"{'='*70}")
+     print(f"Average stutter probability: {avg_prob:.2%}")
+     print(f"Peak stutter probability: {max_prob:.2%}")
+     print(f"Minimum stutter probability: {min_prob:.2%}")
+     print("\nSegment Severity Breakdown:")
+     print(f"  🔴 Severe (>60%): {severe} segments")
+     print(f"  🟠 Moderate (40-60%): {moderate} segments")
+     print(f"  🟡 Mild (20-40%): {mild} segments")
+     print(f"  🟢 Minimal (<=20%): {minimal} segments")
+
+     # Overall severity
+     if avg_prob < 0.15:
+         severity = "✓ Minimal or No Stuttering"
+     elif avg_prob < 0.35:
+         severity = "⚠️ Mild Stuttering"
+     elif avg_prob < 0.60:
+         severity = "⚠️ Moderate Stuttering"
+     else:
+         severity = "🔴 Significant Stuttering"
+
+     print(f"\n🎯 Overall Assessment: {severity}")
+     print(f"📊 Severity Score: {severity_score:.2%} (stutters / total segments)")
+
+     return {
+         'average_probability': avg_prob,
+         'max_probability': max_prob,
+         'severity_level': severity,
+         'severity_score': severity_score,
+         'severe_segments': severe,
+         'moderate_segments': moderate,
+         'mild_segments': mild,
+         'minimal_segments': minimal
+     }
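The module also works standalone, without the Flask app. A minimal sketch, assuming the LFS checkpoint has been pulled and speech.wav is any local recording (the file name is a placeholder):

# run_local.py (hypothetical) - use the detector directly.
from models.stutter_detector_local import ImprovedStutterDetector, calculate_stutter_severity

detector = ImprovedStutterDetector('models/stutter_detector_all_types.pth', device='cpu')

results = detector.analyze_audio_file('speech.wav', segment_duration=3.0, stutter_threshold=0.5)
severity = calculate_stutter_severity(results)
if severity is not None:
    print(severity['severity_level'], f"score={severity['severity_score']:.2%}")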
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ Flask
+ flask-cors
+ Werkzeug
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch
+ transformers
+ librosa
+ soundfile
+ numpy
+ scipy
+ gunicorn
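The --extra-index-url line points pip at PyTorch's CPU-only wheel index, which keeps the image small and matches the device='cpu' load in app.py. A quick post-install sanity check, as a sketch:

# check_torch.py (hypothetical) - confirm the CPU-only build was installed.
import torch

print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())  # expected False with CPU wheels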