BeebekBhz commited on
Commit
30a514c
·
verified ·
1 Parent(s): 45c076b

Uploading wav2vec2-large-xlsr-53-english model

Browse files
.gitattributes CHANGED
@@ -1,35 +1,19 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
4
  *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
  *.joblib filter=lfs diff=lfs merge=lfs -text
 
 
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
14
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
18
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
19
+ language_model/lm.binary filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,3 +1,165 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - common_voice
5
+ - mozilla-foundation/common_voice_6_0
6
+ metrics:
7
+ - wer
8
+ - cer
9
+ tags:
10
+ - audio
11
+ - automatic-speech-recognition
12
+ - en
13
+ - hf-asr-leaderboard
14
+ - mozilla-foundation/common_voice_6_0
15
+ - robust-speech-event
16
+ - speech
17
+ - xlsr-fine-tuning-week
18
+ license: apache-2.0
19
+ model-index:
20
+ - name: XLSR Wav2Vec2 English by Jonatas Grosman
21
+ results:
22
+ - task:
23
+ name: Automatic Speech Recognition
24
+ type: automatic-speech-recognition
25
+ dataset:
26
+ name: Common Voice en
27
+ type: common_voice
28
+ args: en
29
+ metrics:
30
+ - name: Test WER
31
+ type: wer
32
+ value: 19.06
33
+ - name: Test CER
34
+ type: cer
35
+ value: 7.69
36
+ - name: Test WER (+LM)
37
+ type: wer
38
+ value: 14.81
39
+ - name: Test CER (+LM)
40
+ type: cer
41
+ value: 6.84
42
+ - task:
43
+ name: Automatic Speech Recognition
44
+ type: automatic-speech-recognition
45
+ dataset:
46
+ name: Robust Speech Event - Dev Data
47
+ type: speech-recognition-community-v2/dev_data
48
+ args: en
49
+ metrics:
50
+ - name: Dev WER
51
+ type: wer
52
+ value: 27.72
53
+ - name: Dev CER
54
+ type: cer
55
+ value: 11.65
56
+ - name: Dev WER (+LM)
57
+ type: wer
58
+ value: 20.85
59
+ - name: Dev CER (+LM)
60
+ type: cer
61
+ value: 11.01
62
+ ---
63
+
64
+ # Fine-tuned XLSR-53 large model for speech recognition in English
65
+
66
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on English using the train and validation splits of [Common Voice 6.1](https://huggingface.co/datasets/common_voice).
67
+ When using this model, make sure that your speech input is sampled at 16kHz.
68
+
69
+ This model has been fine-tuned thanks to the GPU credits generously given by the [OVHcloud](https://www.ovhcloud.com/en/public-cloud/ai-training/) :)
70
+
71
+ The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
72
+
73
+ ## Usage
74
+
75
+ The model can be used directly (without a language model) as follows...
76
+
77
+ Using the [HuggingSound](https://github.com/jonatasgrosman/huggingsound) library:
78
+
79
+ ```python
80
+ from huggingsound import SpeechRecognitionModel
81
+
82
+ model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")
83
+ audio_paths = ["/path/to/file.mp3", "/path/to/another_file.wav"]
84
+
85
+ transcriptions = model.transcribe(audio_paths)
86
+ ```
87
+
88
+ Writing your own inference script:
89
+
90
+ ```python
91
+ import torch
92
+ import librosa
93
+ from datasets import load_dataset
94
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
95
+
96
+ LANG_ID = "en"
97
+ MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
98
+ SAMPLES = 10
99
+
100
+ test_dataset = load_dataset("common_voice", LANG_ID, split=f"test[:{SAMPLES}]")
101
+
102
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
103
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
104
+
105
+ # Preprocessing the datasets.
106
+ # We need to read the audio files as arrays
107
+ def speech_file_to_array_fn(batch):
108
+ speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
109
+ batch["speech"] = speech_array
110
+ batch["sentence"] = batch["sentence"].upper()
111
+ return batch
112
+
113
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
114
+ inputs = processor(test_dataset["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
115
+
116
+ with torch.no_grad():
117
+ logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
118
+
119
+ predicted_ids = torch.argmax(logits, dim=-1)
120
+ predicted_sentences = processor.batch_decode(predicted_ids)
121
+
122
+ for i, predicted_sentence in enumerate(predicted_sentences):
123
+ print("-" * 100)
124
+ print("Reference:", test_dataset[i]["sentence"])
125
+ print("Prediction:", predicted_sentence)
126
+ ```
127
+
128
+ | Reference | Prediction |
129
+ | ------------- | ------------- |
130
+ | "SHE'LL BE ALL RIGHT." | SHE'LL BE ALL RIGHT |
131
+ | SIX | SIX |
132
+ | "ALL'S WELL THAT ENDS WELL." | ALL AS WELL THAT ENDS WELL |
133
+ | DO YOU MEAN IT? | DO YOU MEAN IT |
134
+ | THE NEW PATCH IS LESS INVASIVE THAN THE OLD ONE, BUT STILL CAUSES REGRESSIONS. | THE NEW PATCH IS LESS INVASIVE THAN THE OLD ONE BUT STILL CAUSES REGRESSION |
135
+ | HOW IS MOZILLA GOING TO HANDLE AMBIGUITIES LIKE QUEUE AND CUE? | HOW IS MOSLILLAR GOING TO HANDLE ANDBEWOOTH HIS LIKE Q AND Q |
136
+ | "I GUESS YOU MUST THINK I'M KINDA BATTY." | RUSTIAN WASTIN PAN ONTE BATTLY |
137
+ | NO ONE NEAR THE REMOTE MACHINE YOU COULD RING? | NO ONE NEAR THE REMOTE MACHINE YOU COULD RING |
138
+ | SAUCE FOR THE GOOSE IS SAUCE FOR THE GANDER. | SAUCE FOR THE GUICE IS SAUCE FOR THE GONDER |
139
+ | GROVES STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD. | GRAFS STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD |
140
+
141
+ ## Evaluation
142
+
143
+ 1. To evaluate on `mozilla-foundation/common_voice_6_0` with split `test`
144
+
145
+ ```bash
146
+ python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset mozilla-foundation/common_voice_6_0 --config en --split test
147
+ ```
148
+
149
+ 2. To evaluate on `speech-recognition-community-v2/dev_data`
150
+
151
+ ```bash
152
+ python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset speech-recognition-community-v2/dev_data --config en --split validation --chunk_length_s 5.0 --stride_length_s 1.0
153
+ ```
154
+
155
+ ## Citation
156
+ If you want to cite this model you can use this:
157
+
158
+ ```bibtex
159
+ @misc{grosman2021xlsr53-large-english,
160
+ title={Fine-tuned {XLSR}-53 large model for speech recognition in {E}nglish},
161
+ author={Grosman, Jonatas},
162
+ howpublished={\url{https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english}},
163
+ year={2021}
164
+ }
165
+ ```
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": ["", "<s>", "</s>", "⁇", " ", "'", "-", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
3
+ "activation_dropout": 0.05,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "Wav2Vec2ForCTC"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "conv_bias": true,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "mean",
39
+ "ctc_zero_infinity": true,
40
+ "do_stable_layer_norm": true,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_dropout": 0.0,
44
+ "feat_extract_norm": "layer",
45
+ "feat_proj_dropout": 0.05,
46
+ "final_dropout": 0.0,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.05,
49
+ "hidden_size": 1024,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 4096,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.05,
54
+ "mask_channel_length": 10,
55
+ "mask_channel_min_space": 1,
56
+ "mask_channel_other": 0.0,
57
+ "mask_channel_prob": 0.0,
58
+ "mask_channel_selection": "static",
59
+ "mask_feature_length": 10,
60
+ "mask_feature_prob": 0.0,
61
+ "mask_time_length": 10,
62
+ "mask_time_min_space": 1,
63
+ "mask_time_other": 0.0,
64
+ "mask_time_prob": 0.05,
65
+ "mask_time_selection": "static",
66
+ "model_type": "wav2vec2",
67
+ "num_attention_heads": 16,
68
+ "num_conv_pos_embedding_groups": 16,
69
+ "num_conv_pos_embeddings": 128,
70
+ "num_feat_extract_layers": 7,
71
+ "num_hidden_layers": 24,
72
+ "pad_token_id": 0,
73
+ "transformers_version": "4.7.0.dev0",
74
+ "vocab_size": 33
75
+ }
eval.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from datasets import load_dataset, load_metric, Audio, Dataset
3
+ from transformers import pipeline, AutoFeatureExtractor, AutoTokenizer, AutoConfig, AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
4
+ import re
5
+ import torch
6
+ import argparse
7
+ from typing import Dict
8
+
9
+ def log_results(result: Dataset, args: Dict[str, str]):
10
+ """ DO NOT CHANGE. This function computes and logs the result metrics. """
11
+
12
+ log_outputs = args.log_outputs
13
+ dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
14
+
15
+ # load metric
16
+ wer = load_metric("wer")
17
+ cer = load_metric("cer")
18
+
19
+ # compute metrics
20
+ wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
21
+ cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
22
+
23
+ # print & log results
24
+ result_str = (
25
+ f"WER: {wer_result}\n"
26
+ f"CER: {cer_result}"
27
+ )
28
+ print(result_str)
29
+
30
+ with open(f"{dataset_id}_eval_results.txt", "w") as f:
31
+ f.write(result_str)
32
+
33
+ # log all results in text file. Possibly interesting for analysis
34
+ if log_outputs is not None:
35
+ pred_file = f"log_{dataset_id}_predictions.txt"
36
+ target_file = f"log_{dataset_id}_targets.txt"
37
+
38
+ with open(pred_file, "w") as p, open(target_file, "w") as t:
39
+
40
+ # mapping function to write output
41
+ def write_to_file(batch, i):
42
+ p.write(f"{i}" + "\n")
43
+ p.write(batch["prediction"] + "\n")
44
+ t.write(f"{i}" + "\n")
45
+ t.write(batch["target"] + "\n")
46
+
47
+ result.map(write_to_file, with_indices=True)
48
+
49
+
50
+ def normalize_text(text: str, invalid_chars_regex: str, to_lower: bool) -> str:
51
+ """ DO ADAPT FOR YOUR USE CASE. this function normalizes the target text. """
52
+
53
+ text = text.lower() if to_lower else text.upper()
54
+
55
+ text = re.sub(invalid_chars_regex, " ", text)
56
+
57
+ text = re.sub("\s+", " ", text).strip()
58
+
59
+ return text
60
+
61
+
62
+ def main(args):
63
+ # load dataset
64
+ dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
65
+
66
+ # for testing: only process the first two examples as a test
67
+ # dataset = dataset.select(range(10))
68
+
69
+ # load processor
70
+ if args.greedy:
71
+ processor = Wav2Vec2Processor.from_pretrained(args.model_id)
72
+ decoder = None
73
+ else:
74
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(args.model_id)
75
+ decoder = processor.decoder
76
+
77
+ feature_extractor = processor.feature_extractor
78
+ tokenizer = processor.tokenizer
79
+
80
+ # resample audio
81
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
82
+
83
+ # load eval pipeline
84
+ if args.device is None:
85
+ args.device = 0 if torch.cuda.is_available() else -1
86
+
87
+ config = AutoConfig.from_pretrained(args.model_id)
88
+ model = AutoModelForCTC.from_pretrained(args.model_id)
89
+
90
+ #asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)
91
+ asr = pipeline("automatic-speech-recognition", config=config, model=model, tokenizer=tokenizer,
92
+ feature_extractor=feature_extractor, decoder=decoder, device=args.device)
93
+
94
+ # build normalizer config
95
+ tokenizer = AutoTokenizer.from_pretrained(args.model_id)
96
+ tokens = [x for x in tokenizer.convert_ids_to_tokens(range(0, tokenizer.vocab_size))]
97
+ special_tokens = [
98
+ tokenizer.pad_token, tokenizer.word_delimiter_token,
99
+ tokenizer.unk_token, tokenizer.bos_token,
100
+ tokenizer.eos_token,
101
+ ]
102
+ non_special_tokens = [x for x in tokens if x not in special_tokens]
103
+ invalid_chars_regex = f"[^\s{re.escape(''.join(set(non_special_tokens)))}]"
104
+ normalize_to_lower = False
105
+ for token in non_special_tokens:
106
+ if token.isalpha() and token.islower():
107
+ normalize_to_lower = True
108
+ break
109
+
110
+ # map function to decode audio
111
+ def map_to_pred(batch, args=args, asr=asr, invalid_chars_regex=invalid_chars_regex, normalize_to_lower=normalize_to_lower):
112
+ prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
113
+
114
+ batch["prediction"] = prediction["text"]
115
+ batch["target"] = normalize_text(batch["sentence"], invalid_chars_regex, normalize_to_lower)
116
+ return batch
117
+
118
+ # run inference on all examples
119
+ result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
120
+
121
+ # filtering out empty targets
122
+ result = result.filter(lambda example: example["target"] != "")
123
+
124
+ # compute and log_results
125
+ # do not change function below
126
+ log_results(result, args)
127
+
128
+
129
+ if __name__ == "__main__":
130
+ parser = argparse.ArgumentParser()
131
+
132
+ parser.add_argument(
133
+ "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
134
+ )
135
+ parser.add_argument(
136
+ "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
137
+ )
138
+ parser.add_argument(
139
+ "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
140
+ )
141
+ parser.add_argument(
142
+ "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
143
+ )
144
+ parser.add_argument(
145
+ "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
146
+ )
147
+ parser.add_argument(
148
+ "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
149
+ )
150
+ parser.add_argument(
151
+ "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
152
+ )
153
+ parser.add_argument(
154
+ "--greedy", action='store_true', help="If defined, the LM will be ignored during inference."
155
+ )
156
+ parser.add_argument(
157
+ "--device",
158
+ type=int,
159
+ default=None,
160
+ help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
161
+ )
162
+ args = parser.parse_args()
163
+
164
+ main(args)
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d3842440388a575e19f19cdb05714afd7018392bd1a7e247c601530a653aa40
3
+ size 1261905572
full_eval.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CV - TEST
2
+
3
+ python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset mozilla-foundation/common_voice_6_0 --config en --split test --log_outputs --greedy
4
+ mv log_mozilla-foundation_common_voice_6_0_en_test_predictions.txt log_mozilla-foundation_common_voice_6_0_en_test_predictions_greedy.txt
5
+ mv mozilla-foundation_common_voice_6_0_en_test_eval_results.txt mozilla-foundation_common_voice_6_0_en_test_eval_results_greedy.txt
6
+
7
+ python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset mozilla-foundation/common_voice_6_0 --config en --split test --log_outputs
8
+
9
+ # HF EVENT - DEV
10
+
11
+ python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset speech-recognition-community-v2/dev_data --config en --split validation --chunk_length_s 5.0 --stride_length_s 1.0 --log_outputs --greedy
12
+ mv log_speech-recognition-community-v2_dev_data_en_validation_predictions.txt log_speech-recognition-community-v2_dev_data_en_validation_predictions_greedy.txt
13
+ mv speech-recognition-community-v2_dev_data_en_validation_eval_results.txt speech-recognition-community-v2_dev_data_en_validation_eval_results_greedy.txt
14
+
15
+ python eval.py --model_id jonatasgrosman/wav2vec2-large-xlsr-53-english --dataset speech-recognition-community-v2/dev_data --config en --split validation --chunk_length_s 5.0 --stride_length_s 1.0 --log_outputs
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/lm.binary ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47e16abf6384ebd1b3395144330b60710dc43f3d16c4b2b4794071cd117230e5
3
+ size 862913451
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_6_0_en_test_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_6_0_en_test_predictions_greedy.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_mozilla-foundation_common_voice_6_0_en_test_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_speech-recognition-community-v2_dev_data_en_validation_predictions.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_speech-recognition-community-v2_dev_data_en_validation_predictions_greedy.txt ADDED
The diff for this file is too large to render. See raw diff
 
log_speech-recognition-community-v2_dev_data_en_validation_targets.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6144f8464c6aaa220dd57c5a2ad4039b5710dcf8ee6e67057675f76597c19875
3
+ size 1261942732
mozilla-foundation_common_voice_6_0_en_test_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.1481828839390387
2
+ CER: 0.06848087313203592
mozilla-foundation_common_voice_6_0_en_test_eval_results_greedy.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.19067492882264278
2
+ CER: 0.07694957927516068
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000,
9
+ "processor_class": "Wav2Vec2ProcessorWithLM"
10
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b7688644eeefe1f5760bb4c4a61d085793a3740159fdbf19fd37c5d4f3729bf
3
+ size 1262069143
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
speech-recognition-community-v2_dev_data_en_validation_eval_results.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.2085057090848916
2
+ CER: 0.11011805154105943
speech-recognition-community-v2_dev_data_en_validation_eval_results_greedy.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ WER: 0.27722157868608305
2
+ CER: 0.11652265190008215
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "'": 5, "-": 6, "a": 7, "b": 8, "c": 9, "d": 10, "e": 11, "f": 12, "g": 13, "h": 14, "i": 15, "j": 16, "k": 17, "l": 18, "m": 19, "n": 20, "o": 21, "p": 22, "q": 23, "r": 24, "s": 25, "t": 26, "u": 27, "v": 28, "w": 29, "x": 30, "y": 31, "z": 32}