"""
Audio processing utilities for CompI Phase 2.A: Audio Input Integration

This module provides comprehensive audio analysis capabilities including:
- Audio feature extraction (tempo, energy, spectral features)
- Audio preprocessing and normalization
- Audio-to-text captioning using OpenAI Whisper
- Multimodal prompt fusion combining audio features with text prompts
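
Typical flow (see the illustrative usage sketch at the bottom of this file):
    AudioProcessor.analyze_audio_file -> AudioFeatures
    AudioCaptioner.caption_audio      -> text caption
    MultimodalPromptFusion            -> enhanced prompt and tags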
"""

import logging
from dataclasses import dataclass
from typing import Dict, List, Tuple

import librosa
import numpy as np

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class AudioFeatures:
    """Container for extracted audio features"""
    tempo: float
    energy: float  # RMS energy
    zero_crossing_rate: float
    spectral_centroid: float
    spectral_rolloff: float
    mfcc_mean: np.ndarray
    chroma_mean: np.ndarray
    duration: float
    sample_rate: int
    
    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON serialization"""
        return {
            'tempo': float(self.tempo),
            'energy': float(self.energy),
            'zero_crossing_rate': float(self.zero_crossing_rate),
            'spectral_centroid': float(self.spectral_centroid),
            'spectral_rolloff': float(self.spectral_rolloff),
            'mfcc_mean': self.mfcc_mean.tolist() if hasattr(self.mfcc_mean, 'tolist') else list(self.mfcc_mean),
            'chroma_mean': self.chroma_mean.tolist() if hasattr(self.chroma_mean, 'tolist') else list(self.chroma_mean),
            'duration': float(self.duration),
            'sample_rate': int(self.sample_rate)
        }

class AudioProcessor:
    """Comprehensive audio processing and analysis"""
    
    def __init__(self, target_sr: int = 16000, max_duration: float = 60.0):
        """
        Initialize audio processor
        
        Args:
            target_sr: Target sample rate for processing
            max_duration: Maximum audio duration to process (seconds)
        """
        self.target_sr = target_sr
        self.max_duration = max_duration
        
    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load and preprocess audio file
        
        Args:
            audio_path: Path to audio file
            
        Returns:
            Tuple of (audio_data, sample_rate)
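
        Example (illustrative; "clip.wav" is a placeholder path):
            >>> processor = AudioProcessor(target_sr=16000)
            >>> audio, sr = processor.load_audio("clip.wav")
            >>> sr
            16000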
        """
        try:
            # Load audio with librosa
            audio, sr = librosa.load(
                audio_path, 
                sr=self.target_sr, 
                duration=self.max_duration
            )
            
            # Normalize audio
            audio = librosa.util.normalize(audio)
            
            logger.info(f"Loaded audio: {audio_path}, duration: {len(audio)/sr:.2f}s")
            return audio, sr
            
        except Exception as e:
            logger.error(f"Error loading audio {audio_path}: {e}")
            raise
    
    def extract_features(self, audio: np.ndarray, sr: int) -> AudioFeatures:
        """
        Extract comprehensive audio features
        
        Args:
            audio: Audio signal
            sr: Sample rate
            
        Returns:
            AudioFeatures object containing all extracted features
        """
        try:
            # Basic features
            duration = len(audio) / sr
            
            # Tempo and beat tracking (recent librosa versions return the
            # tempo as a 1-element ndarray; reduce it to a plain float)
            tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
            tempo = float(np.atleast_1d(tempo)[0])
            
            # Energy (RMS)
            rms = librosa.feature.rms(y=audio)[0]
            energy = np.sqrt(np.mean(rms**2))
            
            # Zero crossing rate
            zcr = librosa.feature.zero_crossing_rate(audio)[0]
            zcr_mean = np.mean(zcr)
            
            # Spectral features
            spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
            spectral_centroid = np.mean(spectral_centroids)
            
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
            spectral_rolloff_mean = np.mean(spectral_rolloff)
            
            # MFCC features
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
            mfcc_mean = np.mean(mfccs, axis=1)
            
            # Chroma features
            chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
            chroma_mean = np.mean(chroma, axis=1)
            
            features = AudioFeatures(
                tempo=float(tempo),
                energy=float(energy),
                zero_crossing_rate=float(zcr_mean),
                spectral_centroid=float(spectral_centroid),
                spectral_rolloff=float(spectral_rolloff_mean),
                mfcc_mean=mfcc_mean,
                chroma_mean=chroma_mean,
                duration=float(duration),
                sample_rate=int(sr)
            )
            
            logger.info(f"Extracted features: tempo={float(tempo):.1f}, energy={float(energy):.4f}")
            return features
            
        except Exception as e:
            logger.error(f"Error extracting audio features: {e}")
            raise
    
    def analyze_audio_file(self, audio_path: str) -> AudioFeatures:
        """
        Complete audio analysis pipeline
        
        Args:
            audio_path: Path to audio file
            
        Returns:
            AudioFeatures object
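
        Example (illustrative; "clip.wav" is a placeholder path):
            >>> features = AudioProcessor().analyze_audio_file("clip.wav")
            >>> features.duration <= 60.0  # capped by max_duration
            True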
        """
        audio, sr = self.load_audio(audio_path)
        return self.extract_features(audio, sr)

class AudioCaptioner:
    """Audio-to-text captioning using OpenAI Whisper"""
    
    def __init__(self, model_size: str = "base", device: str = "auto"):
        """
        Initialize audio captioner
        
        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
            device: Device to run on (auto, cpu, cuda)
        """
        self.model_size = model_size
        self.device = device
        self._model = None
        
    def _load_model(self):
        """Lazy-load the Whisper model on first use"""
        if self._model is None:
            try:
                import whisper
                # whisper.load_model expects a concrete device (or None,
                # which auto-selects CUDA when available); map our "auto"
                # sentinel to None so torch.device() does not reject it.
                device = None if self.device == "auto" else self.device
                self._model = whisper.load_model(self.model_size, device=device)
                logger.info(f"Loaded Whisper model: {self.model_size}")
            except ImportError:
                logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper")
                raise
            except Exception as e:
                logger.error(f"Error loading Whisper model: {e}")
                raise
    
    def caption_audio(self, audio_path: str, language: str = "en") -> str:
        """
        Generate text caption from audio
        
        Args:
            audio_path: Path to audio file
            language: Language code for transcription
            
        Returns:
            Text caption of the audio content
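
        Example (illustrative; "speech.wav" is a placeholder path and the
        returned text depends entirely on the recording):
            >>> captioner = AudioCaptioner(model_size="base")
            >>> captioner.caption_audio("speech.wav")
            'hello and welcome to the show'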
        """
        self._load_model()
        
        try:
            import whisper
            
            # Load and preprocess audio for Whisper
            audio = whisper.load_audio(audio_path)
            audio = whisper.pad_or_trim(audio)
            
            # Generate mel spectrogram
            mel = whisper.log_mel_spectrogram(audio).to(self._model.device)
            
            # Decode audio
            options = whisper.DecodingOptions(language=language, fp16=False)
            result = whisper.decode(self._model, mel, options)
            
            caption = result.text.strip()
            logger.info(f"Generated audio caption: '{caption[:50]}...'")
            
            return caption
            
        except Exception as e:
            logger.error(f"Error captioning audio: {e}")
            return ""

class MultimodalPromptFusion:
    """Intelligent fusion of text prompts with audio features and captions"""
    
    def __init__(self):
        """Initialize the (stateless) prompt fusion system"""
    
    def fuse_prompt_with_audio(
        self, 
        text_prompt: str,
        style: str,
        mood: str,
        audio_features: AudioFeatures,
        audio_caption: str = ""
    ) -> str:
        """
        Create enhanced prompt by fusing text with audio analysis
        
        Args:
            text_prompt: Original text prompt
            style: Art style
            mood: Mood/atmosphere
            audio_features: Extracted audio features
            audio_caption: Audio caption from Whisper
            
        Returns:
            Enhanced multimodal prompt
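
        Example (illustrative; `features` is an AudioFeatures instance from
        AudioProcessor, and the exact descriptor suffix depends on its values):
            >>> fusion = MultimodalPromptFusion()
            >>> fusion.fuse_prompt_with_audio(
            ...     "a forest at dawn", "watercolor", "serene",
            ...     features, audio_caption="birdsong")
            'a forest at dawn, watercolor, serene, inspired by the sound of: birdsong, ...'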
        """
        # Start with base prompt
        enhanced_prompt = text_prompt.strip()
        
        # Add style and mood
        if style:
            enhanced_prompt += f", {style}"
        if mood:
            enhanced_prompt += f", {mood}"
        
        # Add audio caption if available
        if audio_caption:
            enhanced_prompt += f", inspired by the sound of: {audio_caption}"
        
        # Add tempo-based descriptors
        if audio_features.tempo < 80:
            enhanced_prompt += ", slow and contemplative"
        elif audio_features.tempo > 140:
            enhanced_prompt += ", fast-paced and energetic"
        elif audio_features.tempo > 120:
            enhanced_prompt += ", upbeat and dynamic"
        
        # Add energy-based descriptors
        if audio_features.energy > 0.05:
            enhanced_prompt += ", vibrant and powerful"
        elif audio_features.energy < 0.02:
            enhanced_prompt += ", gentle and subtle"
        
        # Add rhythm-based descriptors
        if audio_features.zero_crossing_rate > 0.15:
            enhanced_prompt += ", rhythmic and percussive"
        
        # Add tonal descriptors based on spectral features
        if audio_features.spectral_centroid > 3000:
            enhanced_prompt += ", bright and crisp"
        elif audio_features.spectral_centroid < 1500:
            enhanced_prompt += ", warm and deep"
        
        logger.info(f"Enhanced prompt: {enhanced_prompt}")
        return enhanced_prompt
    
    def generate_audio_tags(self, audio_features: AudioFeatures) -> List[str]:
        """
        Generate descriptive tags based on audio features
        
        Args:
            audio_features: Extracted audio features
            
        Returns:
            List of descriptive tags
        """
        tags = []
        
        # Tempo tags
        if audio_features.tempo < 60:
            tags.append("very_slow")
        elif audio_features.tempo < 90:
            tags.append("slow")
        elif audio_features.tempo < 120:
            tags.append("moderate")
        elif audio_features.tempo < 140:
            tags.append("fast")
        else:
            tags.append("very_fast")
        
        # Energy tags
        if audio_features.energy > 0.06:
            tags.append("high_energy")
        elif audio_features.energy > 0.03:
            tags.append("medium_energy")
        else:
            tags.append("low_energy")
        
        # Rhythm tags
        if audio_features.zero_crossing_rate > 0.15:
            tags.append("percussive")
        elif audio_features.zero_crossing_rate < 0.05:
            tags.append("smooth")
        
        # Spectral tags
        if audio_features.spectral_centroid > 3000:
            tags.append("bright")
        elif audio_features.spectral_centroid < 1500:
            tags.append("dark")
        
        return tags
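

if __name__ == "__main__":
    # Minimal usage sketch, not a formal CLI: "sample.wav" is a placeholder
    # path, and the prompt/style/mood strings below are only examples.
    processor = AudioProcessor()
    features = processor.analyze_audio_file("sample.wav")
    print("Audio tags:", MultimodalPromptFusion().generate_audio_tags(features))

    # Captioning requires the optional openai-whisper dependency.
    caption = AudioCaptioner(model_size="base").caption_audio("sample.wav")

    fusion = MultimodalPromptFusion()
    enhanced = fusion.fuse_prompt_with_audio(
        text_prompt="a city skyline at dusk",
        style="digital painting",
        mood="dreamy",
        audio_features=features,
        audio_caption=caption,
    )
    print("Enhanced prompt:", enhanced)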