"""
Audio processing utilities for CompI Phase 2.A: Audio Input Integration

This module provides comprehensive audio analysis capabilities including:
- Audio feature extraction (tempo, energy, spectral features)
- Audio preprocessing and normalization
- Audio-to-text captioning using OpenAI Whisper
- Multimodal prompt fusion combining audio features with text prompts
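
Typical flow (see the illustrative usage sketch at the bottom of this file):
    AudioProcessor.analyze_audio_file -> AudioFeatures
    AudioCaptioner.caption_audio      -> text caption
    MultimodalPromptFusion            -> enhanced prompt and tags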
"""

import logging
from dataclasses import dataclass
from typing import Dict, List, Tuple

import librosa
import numpy as np

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class AudioFeatures:
    """Container for extracted audio features"""
    tempo: float
    energy: float  # RMS energy
    zero_crossing_rate: float
    spectral_centroid: float
    spectral_rolloff: float
    mfcc_mean: np.ndarray
    chroma_mean: np.ndarray
    duration: float
    sample_rate: int
    
    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON serialization"""
        return {
            'tempo': float(self.tempo),
            'energy': float(self.energy),
            'zero_crossing_rate': float(self.zero_crossing_rate),
            'spectral_centroid': float(self.spectral_centroid),
            'spectral_rolloff': float(self.spectral_rolloff),
            'mfcc_mean': self.mfcc_mean.tolist() if hasattr(self.mfcc_mean, 'tolist') else list(self.mfcc_mean),
            'chroma_mean': self.chroma_mean.tolist() if hasattr(self.chroma_mean, 'tolist') else list(self.chroma_mean),
            'duration': float(self.duration),
            'sample_rate': int(self.sample_rate)
        }

class AudioProcessor:
    """Comprehensive audio processing and analysis"""
    
    def __init__(self, target_sr: int = 16000, max_duration: float = 60.0):
        """
        Initialize audio processor
        
        Args:
            target_sr: Target sample rate for processing
            max_duration: Maximum audio duration to process (seconds)
        """
        self.target_sr = target_sr
        self.max_duration = max_duration
        
    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load and preprocess audio file
        
        Args:
            audio_path: Path to audio file
            
        Returns:
            Tuple of (audio_data, sample_rate)
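
        Example (illustrative; "clip.wav" is a placeholder path):
            >>> processor = AudioProcessor(target_sr=16000)
            >>> audio, sr = processor.load_audio("clip.wav")
            >>> sr
            16000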
        """
        try:
            # Load audio with librosa
            audio, sr = librosa.load(
                audio_path, 
                sr=self.target_sr, 
                duration=self.max_duration
            )
            
            # Normalize audio
            audio = librosa.util.normalize(audio)
            
            logger.info(f"Loaded audio: {audio_path}, duration: {len(audio)/sr:.2f}s")
            return audio, sr
            
        except Exception as e:
            logger.error(f"Error loading audio {audio_path}: {e}")
            raise
    
    def extract_features(self, audio: np.ndarray, sr: int) -> AudioFeatures:
        """
        Extract comprehensive audio features
        
        Args:
            audio: Audio signal
            sr: Sample rate
            
        Returns:
            AudioFeatures object containing all extracted features
        """
        try:
            # Basic features
            duration = len(audio) / sr
            
            # Tempo and beat tracking (recent librosa versions return the
            # tempo as a 1-element ndarray; reduce it to a plain float)
            tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
            tempo = float(np.atleast_1d(tempo)[0])
            
            # Energy (RMS)
            rms = librosa.feature.rms(y=audio)[0]
            energy = np.sqrt(np.mean(rms**2))
            
            # Zero crossing rate
            zcr = librosa.feature.zero_crossing_rate(audio)[0]
            zcr_mean = np.mean(zcr)
            
            # Spectral features
            spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
            spectral_centroid = np.mean(spectral_centroids)
            
            spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
            spectral_rolloff_mean = np.mean(spectral_rolloff)
            
            # MFCC features
            mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
            mfcc_mean = np.mean(mfccs, axis=1)
            
            # Chroma features
            chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
            chroma_mean = np.mean(chroma, axis=1)
            
            features = AudioFeatures(
                tempo=float(tempo),
                energy=float(energy),
                zero_crossing_rate=float(zcr_mean),
                spectral_centroid=float(spectral_centroid),
                spectral_rolloff=float(spectral_rolloff_mean),
                mfcc_mean=mfcc_mean,
                chroma_mean=chroma_mean,
                duration=float(duration),
                sample_rate=int(sr)
            )
            
            logger.info(f"Extracted features: tempo={float(tempo):.1f}, energy={float(energy):.4f}")
            return features
            
        except Exception as e:
            logger.error(f"Error extracting audio features: {e}")
            raise
    
    def analyze_audio_file(self, audio_path: str) -> AudioFeatures:
        """
        Complete audio analysis pipeline
        
        Args:
            audio_path: Path to audio file
            
        Returns:
            AudioFeatures object
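
        Example (illustrative; "clip.wav" is a placeholder path):
            >>> features = AudioProcessor().analyze_audio_file("clip.wav")
            >>> features.duration <= 60.0  # capped by max_duration
            True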
        """
        audio, sr = self.load_audio(audio_path)
        return self.extract_features(audio, sr)

class AudioCaptioner:
    """Audio-to-text captioning using OpenAI Whisper"""
    
    def __init__(self, model_size: str = "base", device: str = "auto"):
        """
        Initialize audio captioner
        
        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
            device: Device to run on (auto, cpu, cuda)
        """
        self.model_size = model_size
        self.device = device
        self._model = None
        
    def _load_model(self):
        """Lazy-load the Whisper model on first use"""
        if self._model is None:
            try:
                import whisper
                # whisper.load_model expects a concrete device (or None,
                # which auto-selects CUDA when available); map our "auto"
                # sentinel to None so torch.device() does not reject it.
                device = None if self.device == "auto" else self.device
                self._model = whisper.load_model(self.model_size, device=device)
                logger.info(f"Loaded Whisper model: {self.model_size}")
            except ImportError:
                logger.error("OpenAI Whisper not installed. Install with: pip install openai-whisper")
                raise
            except Exception as e:
                logger.error(f"Error loading Whisper model: {e}")
                raise
    
    def caption_audio(self, audio_path: str, language: str = "en") -> str:
        """
        Generate text caption from audio
        
        Args:
            audio_path: Path to audio file
            language: Language code for transcription
            
        Returns:
            Text caption of the audio content
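
        Example (illustrative; "speech.wav" is a placeholder path and the
        returned text depends entirely on the recording):
            >>> captioner = AudioCaptioner(model_size="base")
            >>> captioner.caption_audio("speech.wav")
            'hello and welcome to the show'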
        """
        self._load_model()
        
        try:
            import whisper
            
            # Load and preprocess audio for Whisper
            audio = whisper.load_audio(audio_path)
            audio = whisper.pad_or_trim(audio)
            
            # Generate mel spectrogram
            mel = whisper.log_mel_spectrogram(audio).to(self._model.device)
            
            # Decode audio
            options = whisper.DecodingOptions(language=language, fp16=False)
            result = whisper.decode(self._model, mel, options)
            
            caption = result.text.strip()
            logger.info(f"Generated audio caption: '{caption[:50]}...'")
            
            return caption
            
        except Exception as e:
            logger.error(f"Error captioning audio: {e}")
            return ""

class MultimodalPromptFusion:
    """Intelligent fusion of text prompts with audio features and captions"""
    
    def __init__(self):
        """Initialize the (stateless) prompt fusion system"""
    
    def fuse_prompt_with_audio(
        self, 
        text_prompt: str,
        style: str,
        mood: str,
        audio_features: AudioFeatures,
        audio_caption: str = ""
    ) -> str:
        """
        Create enhanced prompt by fusing text with audio analysis
        
        Args:
            text_prompt: Original text prompt
            style: Art style
            mood: Mood/atmosphere
            audio_features: Extracted audio features
            audio_caption: Audio caption from Whisper
            
        Returns:
            Enhanced multimodal prompt
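
        Example (illustrative; `features` is an AudioFeatures instance from
        AudioProcessor, and the exact descriptor suffix depends on its values):
            >>> fusion = MultimodalPromptFusion()
            >>> fusion.fuse_prompt_with_audio(
            ...     "a forest at dawn", "watercolor", "serene",
            ...     features, audio_caption="birdsong")
            'a forest at dawn, watercolor, serene, inspired by the sound of: birdsong, ...'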
        """
        # Start with base prompt
        enhanced_prompt = text_prompt.strip()
        
        # Add style and mood
        if style:
            enhanced_prompt += f", {style}"
        if mood:
            enhanced_prompt += f", {mood}"
        
        # Add audio caption if available
        if audio_caption:
            enhanced_prompt += f", inspired by the sound of: {audio_caption}"
        
        # Add tempo-based descriptors
        if audio_features.tempo < 80:
            enhanced_prompt += ", slow and contemplative"
        elif audio_features.tempo > 140:
            enhanced_prompt += ", fast-paced and energetic"
        elif audio_features.tempo > 120:
            enhanced_prompt += ", upbeat and dynamic"
        
        # Add energy-based descriptors
        if audio_features.energy > 0.05:
            enhanced_prompt += ", vibrant and powerful"
        elif audio_features.energy < 0.02:
            enhanced_prompt += ", gentle and subtle"
        
        # Add rhythm-based descriptors
        if audio_features.zero_crossing_rate > 0.15:
            enhanced_prompt += ", rhythmic and percussive"
        
        # Add tonal descriptors based on spectral features
        if audio_features.spectral_centroid > 3000:
            enhanced_prompt += ", bright and crisp"
        elif audio_features.spectral_centroid < 1500:
            enhanced_prompt += ", warm and deep"
        
        logger.info(f"Enhanced prompt: {enhanced_prompt}")
        return enhanced_prompt
    
    def generate_audio_tags(self, audio_features: AudioFeatures) -> List[str]:
        """
        Generate descriptive tags based on audio features
        
        Args:
            audio_features: Extracted audio features
            
        Returns:
            List of descriptive tags
        """
        tags = []
        
        # Tempo tags
        if audio_features.tempo < 60:
            tags.append("very_slow")
        elif audio_features.tempo < 90:
            tags.append("slow")
        elif audio_features.tempo < 120:
            tags.append("moderate")
        elif audio_features.tempo < 140:
            tags.append("fast")
        else:
            tags.append("very_fast")
        
        # Energy tags
        if audio_features.energy > 0.06:
            tags.append("high_energy")
        elif audio_features.energy > 0.03:
            tags.append("medium_energy")
        else:
            tags.append("low_energy")
        
        # Rhythm tags
        if audio_features.zero_crossing_rate > 0.15:
            tags.append("percussive")
        elif audio_features.zero_crossing_rate < 0.05:
            tags.append("smooth")
        
        # Spectral tags
        if audio_features.spectral_centroid > 3000:
            tags.append("bright")
        elif audio_features.spectral_centroid < 1500:
            tags.append("dark")
        
        return tags
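

if __name__ == "__main__":
    # Minimal usage sketch, not a formal CLI: "sample.wav" is a placeholder
    # path, and the prompt/style/mood strings below are only examples.
    processor = AudioProcessor()
    features = processor.analyze_audio_file("sample.wav")
    print("Audio tags:", MultimodalPromptFusion().generate_audio_tags(features))

    # Captioning requires the optional openai-whisper dependency.
    caption = AudioCaptioner(model_size="base").caption_audio("sample.wav")

    fusion = MultimodalPromptFusion()
    enhanced = fusion.fuse_prompt_with_audio(
        text_prompt="a city skyline at dusk",
        style="digital painting",
        mood="dreamy",
        audio_features=features,
        audio_caption=caption,
    )
    print("Enhanced prompt:", enhanced)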