speech-test committed on
Commit
8cd9d75
·
1 Parent(s): 137287d

Pure Python evaluation

Browse files
Files changed (1) hide show
  1. README.md +10 -2
README.md CHANGED
@@ -80,11 +80,19 @@ tar -zxvf cv.tar.gz
80
  ```python
81
  import torch
82
  import torchaudio
 
 
83
  import pandas as pd
84
  from tqdm.auto import tqdm
85
- from datasets import load_dataset, load_metric
86
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
87
 
 
 
 
 
 
 
88
  wer = load_metric("wer")
89
 
90
  processor = Wav2Vec2Processor.from_pretrained("anton-l/wav2vec2-large-xlsr-53-chuvash")
@@ -97,7 +105,7 @@ clips_path = "cv-corpus-6.1-2020-12-11/cv/clips/"
97
  def clean_sentence(sent):
98
  sent = sent.lower()
99
  # replace non-alpha characters with space
100
- sent = "".join(ch if ch.isalpha() or ch == "'" else " " for ch in sent)
101
  # remove repeated spaces
102
  sent = " ".join(sent.split())
103
  return sent
 
80
  ```python
81
  import torch
82
  import torchaudio
83
+ import urllib.request
84
+ import tarfile
85
  import pandas as pd
86
  from tqdm.auto import tqdm
87
+ from datasets import load_metric
88
  from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
89
 
90
+ # Download the raw data instead of using HF datasets to save space
91
+ data_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cv.tar.gz"
92
+ filestream = urllib.request.urlopen(data_url)
93
+ data_file = tarfile.open(fileobj=filestream, mode="r|gz")
94
+ data_file.extractall()
95
+
96
  wer = load_metric("wer")
97
 
98
  processor = Wav2Vec2Processor.from_pretrained("anton-l/wav2vec2-large-xlsr-53-chuvash")
 
105
  def clean_sentence(sent):
106
  sent = sent.lower()
107
  # replace non-alpha characters with space
108
+ sent = "".join(ch if ch.isalpha() else " " for ch in sent)
109
  # remove repeated spaces
110
  sent = " ".join(sent.split())
111
  return sent