nguyennp86 committed on
Commit
05bc8c1
·
verified ·
1 Parent(s): bde3617

Create data_loader.py

Browse files
Files changed (1) hide show
  1. src/data_loader.py +115 -0
src/data_loader.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading and dataset scanning utilities
3
+ """
4
+
5
+ from pathlib import Path
6
+ from typing import List, Tuple, Optional
7
+ import config
8
+
9
def extract_emotion_from_filename(filename: str) -> str:
    """Extract the emotion label from a RAVDESS-style filename.

    The filename is split on ``-``. When it has at least three fields, the
    third one is parsed as an integer emotion code and looked up in
    ``config.EMOTION_MAP``; a code missing from the map yields ``'unknown'``.
    When the third field is missing or not an integer, the lower-cased
    filename is searched for any emotion name from ``config.EMOTION_MAP``.

    Args:
        filename: File name (not a full path), e.g. the ``.name`` of a Path.

    Returns:
        The emotion label, or ``'unknown'`` when nothing matches.
    """
    parts = filename.split('-')
    if len(parts) >= 3:
        try:
            emotion_code = int(parts[2])
        except ValueError:
            # Third field is not numeric: this is not a coded filename, so
            # fall through to the keyword search instead of failing.
            pass
        else:
            return config.EMOTION_MAP.get(emotion_code, 'unknown')

    # Fallback: check the filename for emotion keywords
    filename_lower = filename.lower()
    for emotion in config.EMOTION_MAP.values():
        if emotion in filename_lower:
            return emotion

    return 'unknown'
26
+
27
+
28
def extract_actor_from_filename(filename: str) -> str:
    """Extract the actor ID from a dash-separated filename.

    The seventh ``-``-separated field, with everything from its first ``.``
    onward dropped (e.g. the extension), is parsed as an integer and
    formatted as ``Actor_NN`` (zero-padded to two digits).

    Args:
        filename: File name (not a full path).

    Returns:
        ``'Actor_NN'``, or ``'Unknown'`` when the filename has fewer than
        seven fields or the seventh field is not an integer.
    """
    parts = filename.split('-')
    if len(parts) < 7:
        return 'Unknown'

    try:
        actor_id = int(parts[6].split('.')[0])
    except ValueError:
        # Only a non-numeric actor field is an expected failure here; other
        # exceptions (e.g. KeyboardInterrupt) must not be swallowed.
        return 'Unknown'

    return f'Actor_{actor_id:02d}'
38
+
39
+
40
def scan_dataset_directory(data_dir: Optional[Path] = None) -> Tuple[List[Path], Optional[str]]:
    """
    Scan data directory and return list of audio files

    Looks for sub-directories whose name starts with ``Actor_`` directly
    under the dataset directory and collects the ``*.wav`` files found
    directly inside each of them. Actor directories and the files within
    each directory are returned in sorted order so the result does not
    depend on filesystem enumeration order.

    Args:
        data_dir: Path to dataset directory containing Actor_XX folders.
            Defaults to ``config.DATA_DIR`` when None. If it is not an
            existing directory, a few fallback locations are tried.

    Returns:
        tuple: (list of audio file paths, error message or None). On
        failure the list is empty and the message explains why.
    """
    if data_dir is None:
        data_dir = config.DATA_DIR

    data_path = Path(data_dir)

    # is_dir() rather than exists(): a regular file at this path would pass
    # exists() and then blow up in iterdir() below.
    if not data_path.is_dir():
        # Try alternative paths. config.DATA_DIR is wrapped in Path() so a
        # plain string setting works as well as a Path.
        alternative_paths = [
            Path(config.DATA_DIR),
            Path('data/audio_speech_actors_01-24'),
            Path('../data/RAVDESS/audio_speech_actors_01-24'),
            Path('./RAVDESS/audio_speech_actors_01-24'),
        ]
        data_path = next(
            (alt for alt in alternative_paths if alt.is_dir()),
            data_path,
        )

    if not data_path.is_dir():
        return [], f"❌ Dataset directory not found: {data_dir}"

    # Find all Actor directories
    actor_dirs = sorted(
        d for d in data_path.iterdir()
        if d.is_dir() and d.name.startswith('Actor_')
    )

    if not actor_dirs:
        return [], f"❌ No Actor directories found in {data_path}"

    # Collect all .wav files, sorted per actor for a deterministic order
    audio_files: List[Path] = []
    for actor_dir in actor_dirs:
        audio_files.extend(sorted(actor_dir.glob('*.wav')))

    return audio_files, None
88
+
89
+
90
def get_dataset_statistics(audio_files: List[Path]) -> dict:
    """
    Get statistics about the dataset

    Each file's name is passed to ``extract_emotion_from_filename`` and
    ``extract_actor_from_filename``; the results are tallied.

    Args:
        audio_files: List of audio file paths (``Path`` objects; plain
            string paths are accepted too).

    Returns:
        dict: Statistics dictionary with keys
            ``'total_files'`` (int), ``'emotion_counts'`` (dict mapping
            emotion label to count), ``'n_actors'`` (int) and ``'actors'``
            (sorted list of distinct actor labels).
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from collections import Counter

    # Path(...) makes str entries work; for Path entries it is a no-op.
    names = [Path(audio_file).name for audio_file in audio_files]

    emotion_counts = Counter(extract_emotion_from_filename(name) for name in names)
    actor_set = {extract_actor_from_filename(name) for name in names}

    return {
        'total_files': len(names),
        # Plain dict so callers get the same type as before
        'emotion_counts': dict(emotion_counts),
        'n_actors': len(actor_set),
        'actors': sorted(actor_set),
    }