Update Moroccan dialect model card.
README.md CHANGED
````diff
@@ -55,7 +55,7 @@ model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
 model.to("cuda")
 
 
-chars_to_ignore_regex = '[
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\'\\�]'
 
 def remove_special_characters(batch):
     batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
````
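For context, `remove_special_characters` only strips punctuation and lowercases the transcript before training and evaluation. A minimal sketch of its effect, using an equivalent raw-string form of the character class and an invented sample sentence:

```python
import re

# Equivalent raw-string form of the character class above; inside [...]
# only the hyphen and the quote need escaping.
chars_to_ignore_regex = r'[,?.!\-;:"“\'�]'

def remove_special_characters(batch):
    # Drop punctuation, lowercase, and append the trailing space the card uses.
    batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
    return batch

# Invented Darija sample record, for illustration only.
print(remove_special_characters({"sentence": "Wach nta mrid, wla la?"})["text"])
# -> 'wach nta mrid wla la '
```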
````diff
@@ -105,67 +105,60 @@ The model can be evaluated as follows on the Arabic test data of Common Voice.
 
 
 ```python
+import re
 import torch
+import librosa
 import torchaudio
 from datasets import load_dataset, load_metric
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-import
+import soundfile as sf
 
-
+eval_dataset = load_dataset("ma_speech_corpus", split="test")
 wer = load_metric("wer")
 
-processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
+processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
 model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-moroccan")
 model.to("cuda")
 
-chars_to_ignore_regex = '[
-
-
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\'\�]'
+
+def remove_special_characters(batch):
+    batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
+    return batch
+
+
+eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
+#eval_dataset = eval_dataset.select(range(100))
 
-# Preprocessing the datasets.
-# We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
-
-    batch["text"] = re.sub('[a-zA-z]', '', batch["text"]).lower() + " "
-    batch["text"] = re.sub('[ًٌٍَُِ~]', '', batch["text"]).lower() + " "
-
-    # batch["text"] = re.sub('\\\\\\
-    ','', batch["text"])
-    batch["text"] = re.sub("[إأٱآا]", "ا", batch["text"])
-    batch["text"] = re.sub("ڸ", "ل", batch["text"])
-    noise = re.compile(""" ّ    | # Tashdid
-     َ    | # Fatha
-     ً    | # Tanwin Fath
-     ُ    | # Damma
-     ٌ    | # Tanwin Damm
-     ِ    | # Kasra
-     ٍ    | # Tanwin Kasr
-     ْ    | # Sukun
-     ـ     # Tatwil/Kashida
-    """, re.VERBOSE)
-    batch["text"] = re.sub(noise, '', batch["text"])
-    batch["text"] = re.sub('ٖ', '', batch["text"]).lower() + " "
+    start, stop = batch['segment'].split('_')
     speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["
+    speech_array, sampling_rate = sf.read(batch["path"], start=int(float(start) * sampling_rate),
+                                          stop=int(float(stop) * sampling_rate))
+    batch["speech"] = librosa.resample(speech_array, sampling_rate, 16_000)
+    batch["sampling_rate"] = 16_000
+    batch["target_text"] = batch["text"]
     return batch
 
-test_dataset = test_dataset.map(speech_file_to_array_fn)
 
-
-
+eval_dataset = eval_dataset.map(
+    speech_file_to_array_fn,
+    remove_columns=eval_dataset.column_names
+)
+
 def evaluate(batch):
     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 
     with torch.no_grad():
-
+        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
 
     pred_ids = torch.argmax(logits, dim=-1)
     batch["pred_strings"] = processor.batch_decode(pred_ids)
     return batch
 
-result =
+result = eval_dataset.map(evaluate, batched=True, batch_size=32)
 
-print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["
+print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["target_text"])))
 ```
 
 **Test Result**: 66.45
````
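The substantive change in this hunk is in `speech_file_to_array_fn`: instead of normalizing Arabic text inline, each example is now cut out of a longer recording using the dataset's `segment` field (start/stop offsets in seconds, underscore-separated, judging by the `split('_')` call) and resampled to 16 kHz. A standalone sketch of that loading logic; the `load_segment` helper, the use of `sf.info` for the native rate, and the example segment string are illustrative assumptions:

```python
import librosa
import soundfile as sf

def load_segment(path, segment, target_sr=16_000):
    # `segment` is assumed to look like "12.5_17.0": start/stop offsets
    # in seconds joined by an underscore, as in the snippet above.
    start, stop = segment.split("_")
    sr = sf.info(path).samplerate  # native rate (the card gets it from torchaudio.load)
    # soundfile slices by frame index, so convert seconds -> frames.
    speech, sr = sf.read(path,
                         start=int(float(start) * sr),
                         stop=int(float(stop) * sr))
    # Resample to the 16 kHz rate the XLSR processor expects; keyword
    # arguments keep this working on librosa >= 0.10 as well.
    return librosa.resample(speech, orig_sr=sr, target_sr=target_sr)

# speech = load_segment("clips/rec_001.wav", "12.5_17.0")  # hypothetical file
```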
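For reference, the reported score is the corpus-level word error rate from the `wer` metric (`datasets.load_metric` here; newer `datasets` releases moved metrics to the `evaluate` library), and `{:2f}` in the print above formats with the default six decimals, so `{:.2f}` was probably intended. A toy run of the same final computation, with invented strings:

```python
from datasets import load_metric

wer = load_metric("wer")

# Invented predictions/references, for illustration only.
predictions = ["salam alikom", "wach nta mrid"]
references = ["salam alikoum", "wach nta mrid"]

# One substitution over five reference words -> 0.2
print("WER: {:2f}".format(100 * wer.compute(predictions=predictions, references=references)))
# -> WER: 20.000000
```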