Spaces:

nguyennp86
/

speech-emotion-recognition

Sleeping

App Files Files Community

nguyennp86 committed on Oct 3

Commit

05bc8c1

verified ·

1 Parent(s): bde3617

Create data_loader.py

Browse files

Files changed (1) hide show

src/data_loader.py +115 -0

src/data_loader.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""
+Data loading and dataset scanning utilities
+"""
+from pathlib import Path
+from typing import List, Tuple, Optional
+import config
def extract_emotion_from_filename(filename: str) -> str:
    """Extract the emotion label from a RAVDESS-style filename.

    The filename is split on ``-``. When it has at least three fields and
    the third one parses as an integer, that integer is looked up in
    ``config.EMOTION_MAP``; a code missing from the map yields ``'unknown'``
    immediately, without trying the keyword fallback.

    When the name has fewer than three fields, or the third field is not an
    integer, the lower-cased filename is searched for each emotion name in
    ``config.EMOTION_MAP`` and the first one found is returned.

    Args:
        filename: File name (not a full path) to inspect.

    Returns:
        The emotion label, or ``'unknown'`` if none could be determined.
    """
    parts = filename.split('-')
    if len(parts) >= 3:
        try:
            emotion_code = int(parts[2])
        except ValueError:
            # Third field is not numeric: not a RAVDESS-style name, so fall
            # through to the keyword search below.
            pass
        else:
            return config.EMOTION_MAP.get(emotion_code, 'unknown')

    # Fallback: check the filename for emotion keywords.
    filename_lower = filename.lower()
    for emotion in config.EMOTION_MAP.values():
        if emotion in filename_lower:
            return emotion
    return 'unknown'
def extract_actor_from_filename(filename: str) -> str:
    """Extract the actor ID from a dash-separated filename.

    The filename is split on ``-``; the seventh field, with anything from
    its first ``.`` onwards removed, is parsed as an integer and formatted
    as ``Actor_NN`` (zero-padded to at least two digits).

    Args:
        filename: File name (not a full path) to inspect.

    Returns:
        ``'Actor_NN'`` on success, or ``'Unknown'`` when the name has fewer
        than seven fields, the seventh field is not an integer, or
        ``filename`` is not a string.
    """
    # Keep the best-effort contract for bad input types without resorting
    # to a catch-all exception handler.
    if not isinstance(filename, str):
        return 'Unknown'

    parts = filename.split('-')
    if len(parts) < 7:
        return 'Unknown'

    # Drop the extension (everything from the first dot) before parsing.
    actor_field = parts[6].partition('.')[0]
    try:
        actor_id = int(actor_field)
    except ValueError:
        return 'Unknown'
    return f'Actor_{actor_id:02d}'
def scan_dataset_directory(data_dir: Optional[Path] = None) -> Tuple[List[Path], Optional[str]]:
    """Scan a dataset directory and return the ``.wav`` files it contains.

    Only sub-directories whose name starts with ``Actor_`` are searched, and
    only their direct ``*.wav`` children are collected. Results are sorted
    (actor directories first, then files within each directory) so the
    returned order does not depend on the filesystem.

    Args:
        data_dir: Dataset directory containing ``Actor_XX`` folders.
            Defaults to ``config.DATA_DIR``. If it is not an existing
            directory, a few fallback locations are tried in order.

    Returns:
        tuple: ``(audio_files, error)``. On success ``error`` is ``None``.
        On failure ``audio_files`` is empty and ``error`` is a
        human-readable message.
    """
    if data_dir is None:
        data_dir = config.DATA_DIR
    data_path = Path(data_dir)

    # is_dir() rather than exists(): a regular file at this path would
    # otherwise pass the check and then crash in iterdir() below.
    if not data_path.is_dir():
        # Try alternative paths. Each candidate is wrapped in Path() so a
        # plain-string config.DATA_DIR is handled as well.
        alternative_paths = [
            Path(config.DATA_DIR),
            Path('data/audio_speech_actors_01-24'),
            Path('../data/RAVDESS/audio_speech_actors_01-24'),
            Path('./RAVDESS/audio_speech_actors_01-24'),
        ]
        data_path = next(
            (alt_path for alt_path in alternative_paths if alt_path.is_dir()),
            None,
        )
        if data_path is None:
            return [], f"❌ Dataset directory not found: {data_dir}"

    # Find all Actor directories.
    actor_dirs = sorted(
        d for d in data_path.iterdir()
        if d.is_dir() and d.name.startswith('Actor_')
    )
    if not actor_dirs:
        return [], f"❌ No Actor directories found in {data_path}"

    # Collect all .wav files; glob order is filesystem-dependent, so sort
    # within each actor directory for a reproducible listing.
    audio_files = [
        wav_file
        for actor_dir in actor_dirs
        for wav_file in sorted(actor_dir.glob('*.wav'))
    ]
    return audio_files, None
def get_dataset_statistics(audio_files: List[Path]) -> dict:
    """
    Summarise a list of audio files by emotion and actor.

    Each file's name is passed to ``extract_emotion_from_filename`` and
    ``extract_actor_from_filename``; the results are tallied.

    Args:
        audio_files: List of audio file paths
    Returns:
        dict: with keys ``'total_files'`` (number of paths given),
        ``'emotion_counts'`` (emotion label -> number of files),
        ``'n_actors'`` (number of distinct actor labels) and ``'actors'``
        (those labels, sorted).
    """
    per_emotion = {}
    seen_actors = set()

    for path in audio_files:
        label = extract_emotion_from_filename(path.name)
        if label in per_emotion:
            per_emotion[label] += 1
        else:
            per_emotion[label] = 1

        seen_actors.add(extract_actor_from_filename(path.name))

    actor_names = list(seen_actors)
    actor_names.sort()

    stats = {}
    stats['total_files'] = len(audio_files)
    stats['emotion_counts'] = per_emotion
    stats['n_actors'] = len(seen_actors)
    stats['actors'] = actor_names
    return stats