add CV 10
- .gitattributes +0 -0
- .gitignore +0 -0
- README.template.md +35 -3
- dataset_script.py +3 -3
- generate_datasets.py +5 -0
- languages.ftl +13 -1
- publish.py +3 -0
- test.py +1 -1
.gitattributes
CHANGED
File without changes
.gitignore
CHANGED
File without changes
README.template.md
CHANGED

@@ -4,9 +4,9 @@ annotations_creators:
 - crowdsourced
 language_creators:
 - crowdsourced
-languages:
+language_bcp47:
 {{LANGUAGES}}
-licenses:
+license:
 - cc0-1.0
 multilinguality:
 - multilingual

@@ -68,7 +68,7 @@ Take a look at the [Languages](https://commonvoice.mozilla.org/en/languages) page
 ### Supported Tasks and Leaderboards
 
 The results for models trained on the Common Voice datasets are available via the
-[
+[🤗 Speech Bench](https://huggingface.co/spaces/huggingface/hf-speech-bench)
 
 ### Languages
 

@@ -142,6 +142,38 @@ The other data is data that has not yet been reviewed.
 
 The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
 
+## Data Preprocessing Recommended by Hugging Face
+
+The following are data preprocessing steps advised by the Hugging Face team. They are accompanied by an example code snippet that shows how to put them into practice.
+
+Many examples in this dataset have trailing quotation marks, e.g. _“the cat sat on the mat.”_. These trailing quotation marks do not change the actual meaning of the sentence, and it is near impossible to infer whether a sentence is a quotation or not from audio data alone. In these cases, it is advised to strip the quotation marks, leaving: _the cat sat on the mat_.
+
+In addition, the majority of training sentences end in punctuation ( . or ? or ! ), whereas just a small proportion do not. In the dev set, **almost all** sentences end in punctuation. Thus, it is recommended to append a full-stop ( . ) to the end of the small number of training examples that do not end in punctuation.
+
+```python
+from datasets import load_dataset
+
+ds = load_dataset("mozilla-foundation/{{NAME}}", "en", use_auth_token=True)
+
+def prepare_dataset(batch):
+    """Function to preprocess the dataset with the .map method"""
+    transcription = batch["sentence"]
+
+    if transcription.startswith('"') and transcription.endswith('"'):
+        # we can remove trailing quotation marks as they do not affect the transcription
+        transcription = transcription[1:-1]
+
+    if transcription[-1] not in [".", "?", "!"]:
+        # append a full-stop to sentences that do not end in punctuation
+        transcription = transcription + "."
+
+    batch["sentence"] = transcription
+
+    return batch
+
+ds = ds.map(prepare_dataset, desc="preprocess dataset")
+```
+
 ## Dataset Creation
 
 ### Curation Rationale
dataset_script.py
CHANGED

@@ -82,9 +82,9 @@ class CommonVoice(datasets.GeneratorBasedBuilder):
                 release_date=STATS["date"],
                 num_clips=lang_stats["clips"],
                 num_speakers=lang_stats["users"],
-                validated_hr=float(lang_stats["validHrs"]),
-                total_hr=float(lang_stats["totalHrs"]),
-                size_bytes=int(lang_stats["size"]),
+                validated_hr=float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None,
+                total_hr=float(lang_stats["totalHrs"]) if lang_stats["totalHrs"] else None,
+                size_bytes=int(lang_stats["size"]) if lang_stats["size"] else None,
             )
             for lang, lang_stats in STATS["locales"].items()
         ]
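The `if ... else None` guards matter because the release stats for brand-new locales can carry empty values, and `float("")` raises. A minimal sketch of the failure mode, using hypothetical stats values (not taken from an actual release):

```python
# hypothetical stats entry for a freshly added locale with no validated hours yet
lang_stats = {"validHrs": "", "totalHrs": "", "size": ""}

try:
    validated_hr = float(lang_stats["validHrs"])  # the old, unguarded conversion
except ValueError as err:
    print(err)  # could not convert string to float: ''

# the guarded form from the diff falls back to None instead of crashing
validated_hr = float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None
print(validated_hr)  # None
```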
generate_datasets.py
CHANGED

@@ -49,6 +49,11 @@ VERSIONS = [
         "name": "common_voice_9_0",
         "release": "cv-corpus-9.0-2022-04-27",
     },
+    {
+        "semver": "10.0.0",
+        "name": "common_voice_10_0",
+        "release": "cv-corpus-10.0-2022-07-04",
+    },
 ]
 
 
languages.ftl
CHANGED

@@ -29,6 +29,7 @@ da = Danish
 de = German
 dsb = Sorbian, Lower
 dv = Dhivehi
+dyu = Dioula
 el = Greek
 en = English
 eo = Esperanto

@@ -97,12 +98,16 @@ my = Burmese
 myv = Erzya
 nan-tw = Taiwanese (Minnan)
 nb-NO = Norwegian Bokmål
+nd = IsiNdebele (North)
 ne-NP = Nepali
 nia = Nias
 nl = Dutch
 nn-NO = Norwegian Nynorsk
+nr = IsiNdebele (South)
+nso = Northern Sotho
 nyn = Runyankole
 oc = Occitan
+om = Afaan Ormoo
 or = Odia
 pa-IN = Punjabi
 pap-AW = Papiamento (Aruba)

@@ -128,6 +133,8 @@ sl = Slovenian
 so = Somali
 sq = Albanian
 sr = Serbian
+ss = Siswati
+st = Southern Sotho
 sv-SE = Swedish
 sw = Swahili
 syr = Syriac

@@ -139,8 +146,10 @@ ti = Tigrinya
 tig = Tigre
 tk = Turkmen
 tl = Tagalog
+tn = Setswana
 tok = Toki Pona
 tr = Turkish
+ts = Xitsonga
 tt = Tatar
 tw = Twi
 ty = Tahitian

@@ -150,12 +159,15 @@ ug = Uyghur
 uk = Ukrainian
 ur = Urdu
 uz = Uzbek
+ve = Tshivenda
 vec = Venetian
 vi = Vietnamese
 vot = Votic
+xh = Xhosa
 yi = Yiddish
 yo = Yoruba
 yue = Cantonese
 zh-CN = Chinese (China)
 zh-HK = Chinese (Hong Kong)
-zh-TW = Chinese (Taiwan)
+zh-TW = Chinese (Taiwan)
+zu = Zulu
publish.py
ADDED

@@ -0,0 +1,3 @@
+from huggingface_hub import create_repo
+
+create_repo("mozilla-foundation/common_voice_10_0", repo_type="dataset")
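As a side note, `create_repo` errors out if the repository already exists; `huggingface_hub` accepts an `exist_ok` flag that makes the call idempotent. A minimal variant (assuming an authenticated environment, e.g. via `huggingface-cli login`):

```python
from huggingface_hub import create_repo

# exist_ok=True avoids an error when the dataset repo was already created
create_repo("mozilla-foundation/common_voice_10_0", repo_type="dataset", exist_ok=True)
```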
test.py
CHANGED

@@ -1,5 +1,5 @@
 from datasets import load_dataset
 
-dataset = load_dataset("
+dataset = load_dataset("mozilla-foundation/common_voice_10_0", "et", split="test", use_auth_token=True)
 print(dataset)
 print(dataset[100])
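For a quicker smoke test that avoids downloading the full Estonian archive, `datasets` also supports streaming; a sketch (assuming access to the gated dataset has already been granted):

```python
from datasets import load_dataset

# stream the test split instead of downloading it in full
dataset = load_dataset("mozilla-foundation/common_voice_10_0", "et", split="test", streaming=True, use_auth_token=True)
print(next(iter(dataset)))  # first example, fetched lazily
```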