Commit
·
2de8373
1
Parent(s):
280cb76
Add FLEURS (el_gr) dataset and interleave it with Common Voice for the training set
Browse files
run_speech_recognition_seq2seq_streaming.py
CHANGED
@@ -358,14 +358,13 @@ def main():
|
|
358 |
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
|
359 |
common_voice = common_voice.remove_columns(set(common_voice.features.keys()) - set(["audio", "sentence"]))
|
360 |
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
|
366 |
-
|
367 |
-
|
368 |
-
raw_datasets["train"] = common_voice
|
369 |
|
370 |
"""
|
371 |
raw_datasets["train"] = load_maybe_streaming_dataset(
|
|
|
358 |
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
|
359 |
common_voice = common_voice.remove_columns(set(common_voice.features.keys()) - set(["audio", "sentence"]))
|
360 |
|
361 |
+
fleurs = load_maybe_streaming_dataset("google/fleurs", "el_gr", split="train+validation+test")
|
362 |
+
fleurs = fleurs.cast_column("audio", Audio(sampling_rate=16000))
|
363 |
+
fleurs = fleurs.rename_column("raw_transcription", "sentence")
|
364 |
+
fleurs = fleurs.remove_columns(set(fleurs.features.keys()) - set(["audio", "sentence"]))
|
365 |
|
366 |
+
all_datasets = [common_voice, fleurs]
|
367 |
+
raw_datasets["train"] = interleave_datasets(all_datasets, stopping_strategy="all_exhausted")
|
|
|
368 |
|
369 |
"""
|
370 |
raw_datasets["train"] = load_maybe_streaming_dataset(
|