"""
Data loading and dataset scanning utilities
"""

from collections import Counter
from pathlib import Path
from typing import List, Tuple, Optional

import config


def extract_emotion_from_filename(filename: str) -> str:
    """Extract the emotion label from a RAVDESS-style filename.

    The filename is split on ``-``. When it has at least three fields and the
    third one parses as an integer, that code is looked up in
    ``config.EMOTION_MAP`` (``'unknown'`` if the code is not in the map).
    Otherwise the lower-cased filename is searched for any emotion name from
    ``config.EMOTION_MAP`` as a keyword fallback.

    Args:
        filename: File name (not a full path), e.g. ``audio_file.name``.

    Returns:
        str: The emotion label, or ``'unknown'`` when nothing matches.
    """
    parts = filename.split('-')
    if len(parts) >= 3:
        try:
            emotion_code = int(parts[2])
        except ValueError:
            # Third field is not numeric: fall through to the keyword search.
            pass
        else:
            return config.EMOTION_MAP.get(emotion_code, 'unknown')

    # Fallback: check the filename for emotion keywords
    filename_lower = filename.lower()
    for emotion in config.EMOTION_MAP.values():
        if emotion in filename_lower:
            return emotion

    return 'unknown'


def extract_actor_from_filename(filename: str) -> str:
    """Extract the actor ID from a filename.

    The filename is split on ``-``; the seventh field, with anything after
    its first ``.`` removed, is parsed as the integer actor number.

    Args:
        filename: File name (not a full path), e.g. ``audio_file.name``.

    Returns:
        str: ``'Actor_NN'`` (zero-padded to two digits), or ``'Unknown'`` when
        the filename has fewer than seven fields or the actor field is not an
        integer.
    """
    parts = filename.split('-')
    if len(parts) < 7:
        return 'Unknown'

    try:
        actor_id = int(parts[6].split('.')[0])
    except ValueError:
        return 'Unknown'

    return f'Actor_{actor_id:02d}'


def scan_dataset_directory(data_dir: Optional[Path] = None) -> Tuple[List[Path], Optional[str]]:
    """
    Scan data directory and return list of audio files

    If ``data_dir`` is not an existing directory, a fixed list of alternative
    locations (starting with ``config.DATA_DIR``) is tried in order and the
    first one that is a directory is used.

    Args:
        data_dir: Path to dataset directory containing Actor_XX folders.
            Defaults to ``config.DATA_DIR`` when None.

    Returns:
        tuple: (list of audio file paths, error message or None). On failure
        the list is empty and the message says what was missing. Files are
        ordered by actor directory name, then by file path.
    """
    if data_dir is None:
        data_dir = config.DATA_DIR

    data_path = Path(data_dir)

    if not data_path.is_dir():
        # Try alternative paths
        alternative_paths = [
            # Coerce to Path: config.DATA_DIR may be configured as a plain str.
            Path(config.DATA_DIR),
            Path('data/audio_speech_actors_01-24'),
            Path('../data/RAVDESS/audio_speech_actors_01-24'),
            Path('./RAVDESS/audio_speech_actors_01-24'),
        ]
        for alt_path in alternative_paths:
            if alt_path.is_dir():
                data_path = alt_path
                break

    # is_dir() rather than exists(): a regular file at this location would
    # otherwise make iterdir() below raise instead of returning the error.
    if not data_path.is_dir():
        return [], f"❌ Dataset directory not found: {data_dir}"

    # Find all Actor directories
    actor_dirs = sorted(
        d for d in data_path.iterdir()
        if d.is_dir() and d.name.startswith('Actor_')
    )

    if not actor_dirs:
        return [], f"❌ No Actor directories found in {data_path}"

    # Collect all .wav files; sorted because glob() order is not guaranteed
    audio_files: List[Path] = []
    for actor_dir in actor_dirs:
        audio_files.extend(sorted(actor_dir.glob('*.wav')))

    return audio_files, None


def get_dataset_statistics(audio_files: List[Path]) -> dict:
    """
    Get statistics about the dataset

    Args:
        audio_files: List of audio file paths

    Returns:
        dict: Statistics dictionary with keys ``'total_files'`` (int),
        ``'emotion_counts'`` (emotion label -> number of files),
        ``'n_actors'`` (number of distinct actor IDs) and ``'actors'``
        (sorted list of actor IDs). Labels and IDs come from the file names
        only, so unparseable names are counted under ``'unknown'`` /
        ``'Unknown'``.
    """
    emotion_counts: Counter = Counter()
    actor_set = set()

    for audio_file in audio_files:
        emotion_counts[extract_emotion_from_filename(audio_file.name)] += 1
        actor_set.add(extract_actor_from_filename(audio_file.name))

    return {
        'total_files': len(audio_files),
        # Plain dict so callers get the same type as before.
        'emotion_counts': dict(emotion_counts),
        'n_actors': len(actor_set),
        'actors': sorted(actor_set),
    }